pere committed
Commit e299e7f · 1 Parent(s): 4a55999
added_tokens.json ADDED
@@ -0,0 +1,1611 @@
1
+ {
2
+ "<|0.00|>": 50365,
3
+ "<|0.02|>": 50366,
4
+ "<|0.04|>": 50367,
5
+ "<|0.06|>": 50368,
6
+ "<|0.08|>": 50369,
7
+ "<|0.10|>": 50370,
8
+ "<|0.12|>": 50371,
9
+ "<|0.14|>": 50372,
10
+ "<|0.16|>": 50373,
11
+ "<|0.18|>": 50374,
12
+ "<|0.20|>": 50375,
13
+ "<|0.22|>": 50376,
14
+ "<|0.24|>": 50377,
15
+ "<|0.26|>": 50378,
16
+ "<|0.28|>": 50379,
17
+ "<|0.30|>": 50380,
18
+ "<|0.32|>": 50381,
19
+ "<|0.34|>": 50382,
20
+ "<|0.36|>": 50383,
21
+ "<|0.38|>": 50384,
22
+ "<|0.40|>": 50385,
23
+ "<|0.42|>": 50386,
24
+ "<|0.44|>": 50387,
25
+ "<|0.46|>": 50388,
26
+ "<|0.48|>": 50389,
27
+ "<|0.50|>": 50390,
28
+ "<|0.52|>": 50391,
29
+ "<|0.54|>": 50392,
30
+ "<|0.56|>": 50393,
31
+ "<|0.58|>": 50394,
32
+ "<|0.60|>": 50395,
33
+ "<|0.62|>": 50396,
34
+ "<|0.64|>": 50397,
35
+ "<|0.66|>": 50398,
36
+ "<|0.68|>": 50399,
37
+ "<|0.70|>": 50400,
38
+ "<|0.72|>": 50401,
39
+ "<|0.74|>": 50402,
40
+ "<|0.76|>": 50403,
41
+ "<|0.78|>": 50404,
42
+ "<|0.80|>": 50405,
43
+ "<|0.82|>": 50406,
44
+ "<|0.84|>": 50407,
45
+ "<|0.86|>": 50408,
46
+ "<|0.88|>": 50409,
47
+ "<|0.90|>": 50410,
48
+ "<|0.92|>": 50411,
49
+ "<|0.94|>": 50412,
50
+ "<|0.96|>": 50413,
51
+ "<|0.98|>": 50414,
52
+ "<|1.00|>": 50415,
53
+ "<|1.02|>": 50416,
54
+ "<|1.04|>": 50417,
55
+ "<|1.06|>": 50418,
56
+ "<|1.08|>": 50419,
57
+ "<|1.10|>": 50420,
58
+ "<|1.12|>": 50421,
59
+ "<|1.14|>": 50422,
60
+ "<|1.16|>": 50423,
61
+ "<|1.18|>": 50424,
62
+ "<|1.20|>": 50425,
63
+ "<|1.22|>": 50426,
64
+ "<|1.24|>": 50427,
65
+ "<|1.26|>": 50428,
66
+ "<|1.28|>": 50429,
67
+ "<|1.30|>": 50430,
68
+ "<|1.32|>": 50431,
69
+ "<|1.34|>": 50432,
70
+ "<|1.36|>": 50433,
71
+ "<|1.38|>": 50434,
72
+ "<|1.40|>": 50435,
73
+ "<|1.42|>": 50436,
74
+ "<|1.44|>": 50437,
75
+ "<|1.46|>": 50438,
76
+ "<|1.48|>": 50439,
77
+ "<|1.50|>": 50440,
78
+ "<|1.52|>": 50441,
79
+ "<|1.54|>": 50442,
80
+ "<|1.56|>": 50443,
81
+ "<|1.58|>": 50444,
82
+ "<|1.60|>": 50445,
83
+ "<|1.62|>": 50446,
84
+ "<|1.64|>": 50447,
85
+ "<|1.66|>": 50448,
86
+ "<|1.68|>": 50449,
87
+ "<|1.70|>": 50450,
88
+ "<|1.72|>": 50451,
89
+ "<|1.74|>": 50452,
90
+ "<|1.76|>": 50453,
91
+ "<|1.78|>": 50454,
92
+ "<|1.80|>": 50455,
93
+ "<|1.82|>": 50456,
94
+ "<|1.84|>": 50457,
95
+ "<|1.86|>": 50458,
96
+ "<|1.88|>": 50459,
97
+ "<|1.90|>": 50460,
98
+ "<|1.92|>": 50461,
99
+ "<|1.94|>": 50462,
100
+ "<|1.96|>": 50463,
101
+ "<|1.98|>": 50464,
102
+ "<|10.00|>": 50865,
103
+ "<|10.02|>": 50866,
104
+ "<|10.04|>": 50867,
105
+ "<|10.06|>": 50868,
106
+ "<|10.08|>": 50869,
107
+ "<|10.10|>": 50870,
108
+ "<|10.12|>": 50871,
109
+ "<|10.14|>": 50872,
110
+ "<|10.16|>": 50873,
111
+ "<|10.18|>": 50874,
112
+ "<|10.20|>": 50875,
113
+ "<|10.22|>": 50876,
114
+ "<|10.24|>": 50877,
115
+ "<|10.26|>": 50878,
116
+ "<|10.28|>": 50879,
117
+ "<|10.30|>": 50880,
118
+ "<|10.32|>": 50881,
119
+ "<|10.34|>": 50882,
120
+ "<|10.36|>": 50883,
121
+ "<|10.38|>": 50884,
122
+ "<|10.40|>": 50885,
123
+ "<|10.42|>": 50886,
124
+ "<|10.44|>": 50887,
125
+ "<|10.46|>": 50888,
126
+ "<|10.48|>": 50889,
127
+ "<|10.50|>": 50890,
128
+ "<|10.52|>": 50891,
129
+ "<|10.54|>": 50892,
130
+ "<|10.56|>": 50893,
131
+ "<|10.58|>": 50894,
132
+ "<|10.60|>": 50895,
133
+ "<|10.62|>": 50896,
134
+ "<|10.64|>": 50897,
135
+ "<|10.66|>": 50898,
136
+ "<|10.68|>": 50899,
137
+ "<|10.70|>": 50900,
138
+ "<|10.72|>": 50901,
139
+ "<|10.74|>": 50902,
140
+ "<|10.76|>": 50903,
141
+ "<|10.78|>": 50904,
142
+ "<|10.80|>": 50905,
143
+ "<|10.82|>": 50906,
144
+ "<|10.84|>": 50907,
145
+ "<|10.86|>": 50908,
146
+ "<|10.88|>": 50909,
147
+ "<|10.90|>": 50910,
148
+ "<|10.92|>": 50911,
149
+ "<|10.94|>": 50912,
150
+ "<|10.96|>": 50913,
151
+ "<|10.98|>": 50914,
152
+ "<|11.00|>": 50915,
153
+ "<|11.02|>": 50916,
154
+ "<|11.04|>": 50917,
155
+ "<|11.06|>": 50918,
156
+ "<|11.08|>": 50919,
157
+ "<|11.10|>": 50920,
158
+ "<|11.12|>": 50921,
159
+ "<|11.14|>": 50922,
160
+ "<|11.16|>": 50923,
161
+ "<|11.18|>": 50924,
162
+ "<|11.20|>": 50925,
163
+ "<|11.22|>": 50926,
164
+ "<|11.24|>": 50927,
165
+ "<|11.26|>": 50928,
166
+ "<|11.28|>": 50929,
167
+ "<|11.30|>": 50930,
168
+ "<|11.32|>": 50931,
169
+ "<|11.34|>": 50932,
170
+ "<|11.36|>": 50933,
171
+ "<|11.38|>": 50934,
172
+ "<|11.40|>": 50935,
173
+ "<|11.42|>": 50936,
174
+ "<|11.44|>": 50937,
175
+ "<|11.46|>": 50938,
176
+ "<|11.48|>": 50939,
177
+ "<|11.50|>": 50940,
178
+ "<|11.52|>": 50941,
179
+ "<|11.54|>": 50942,
180
+ "<|11.56|>": 50943,
181
+ "<|11.58|>": 50944,
182
+ "<|11.60|>": 50945,
183
+ "<|11.62|>": 50946,
184
+ "<|11.64|>": 50947,
185
+ "<|11.66|>": 50948,
186
+ "<|11.68|>": 50949,
187
+ "<|11.70|>": 50950,
188
+ "<|11.72|>": 50951,
189
+ "<|11.74|>": 50952,
190
+ "<|11.76|>": 50953,
191
+ "<|11.78|>": 50954,
192
+ "<|11.80|>": 50955,
193
+ "<|11.82|>": 50956,
194
+ "<|11.84|>": 50957,
195
+ "<|11.86|>": 50958,
196
+ "<|11.88|>": 50959,
197
+ "<|11.90|>": 50960,
198
+ "<|11.92|>": 50961,
199
+ "<|11.94|>": 50962,
200
+ "<|11.96|>": 50963,
201
+ "<|11.98|>": 50964,
202
+ "<|12.00|>": 50965,
203
+ "<|12.02|>": 50966,
204
+ "<|12.04|>": 50967,
205
+ "<|12.06|>": 50968,
206
+ "<|12.08|>": 50969,
207
+ "<|12.10|>": 50970,
208
+ "<|12.12|>": 50971,
209
+ "<|12.14|>": 50972,
210
+ "<|12.16|>": 50973,
211
+ "<|12.18|>": 50974,
212
+ "<|12.20|>": 50975,
213
+ "<|12.22|>": 50976,
214
+ "<|12.24|>": 50977,
215
+ "<|12.26|>": 50978,
216
+ "<|12.28|>": 50979,
217
+ "<|12.30|>": 50980,
218
+ "<|12.32|>": 50981,
219
+ "<|12.34|>": 50982,
220
+ "<|12.36|>": 50983,
221
+ "<|12.38|>": 50984,
222
+ "<|12.40|>": 50985,
223
+ "<|12.42|>": 50986,
224
+ "<|12.44|>": 50987,
225
+ "<|12.46|>": 50988,
226
+ "<|12.48|>": 50989,
227
+ "<|12.50|>": 50990,
228
+ "<|12.52|>": 50991,
229
+ "<|12.54|>": 50992,
230
+ "<|12.56|>": 50993,
231
+ "<|12.58|>": 50994,
232
+ "<|12.60|>": 50995,
233
+ "<|12.62|>": 50996,
234
+ "<|12.64|>": 50997,
235
+ "<|12.66|>": 50998,
236
+ "<|12.68|>": 50999,
237
+ "<|12.70|>": 51000,
238
+ "<|12.72|>": 51001,
239
+ "<|12.74|>": 51002,
240
+ "<|12.76|>": 51003,
241
+ "<|12.78|>": 51004,
242
+ "<|12.80|>": 51005,
243
+ "<|12.82|>": 51006,
244
+ "<|12.84|>": 51007,
245
+ "<|12.86|>": 51008,
246
+ "<|12.88|>": 51009,
247
+ "<|12.90|>": 51010,
248
+ "<|12.92|>": 51011,
249
+ "<|12.94|>": 51012,
250
+ "<|12.96|>": 51013,
251
+ "<|12.98|>": 51014,
252
+ "<|13.00|>": 51015,
253
+ "<|13.02|>": 51016,
254
+ "<|13.04|>": 51017,
255
+ "<|13.06|>": 51018,
256
+ "<|13.08|>": 51019,
257
+ "<|13.10|>": 51020,
258
+ "<|13.12|>": 51021,
259
+ "<|13.14|>": 51022,
260
+ "<|13.16|>": 51023,
261
+ "<|13.18|>": 51024,
262
+ "<|13.20|>": 51025,
263
+ "<|13.22|>": 51026,
264
+ "<|13.24|>": 51027,
265
+ "<|13.26|>": 51028,
266
+ "<|13.28|>": 51029,
267
+ "<|13.30|>": 51030,
268
+ "<|13.32|>": 51031,
269
+ "<|13.34|>": 51032,
270
+ "<|13.36|>": 51033,
271
+ "<|13.38|>": 51034,
272
+ "<|13.40|>": 51035,
273
+ "<|13.42|>": 51036,
274
+ "<|13.44|>": 51037,
275
+ "<|13.46|>": 51038,
276
+ "<|13.48|>": 51039,
277
+ "<|13.50|>": 51040,
278
+ "<|13.52|>": 51041,
279
+ "<|13.54|>": 51042,
280
+ "<|13.56|>": 51043,
281
+ "<|13.58|>": 51044,
282
+ "<|13.60|>": 51045,
283
+ "<|13.62|>": 51046,
284
+ "<|13.64|>": 51047,
285
+ "<|13.66|>": 51048,
286
+ "<|13.68|>": 51049,
287
+ "<|13.70|>": 51050,
288
+ "<|13.72|>": 51051,
289
+ "<|13.74|>": 51052,
290
+ "<|13.76|>": 51053,
291
+ "<|13.78|>": 51054,
292
+ "<|13.80|>": 51055,
293
+ "<|13.82|>": 51056,
294
+ "<|13.84|>": 51057,
295
+ "<|13.86|>": 51058,
296
+ "<|13.88|>": 51059,
297
+ "<|13.90|>": 51060,
298
+ "<|13.92|>": 51061,
299
+ "<|13.94|>": 51062,
300
+ "<|13.96|>": 51063,
301
+ "<|13.98|>": 51064,
302
+ "<|14.00|>": 51065,
303
+ "<|14.02|>": 51066,
304
+ "<|14.04|>": 51067,
305
+ "<|14.06|>": 51068,
306
+ "<|14.08|>": 51069,
307
+ "<|14.10|>": 51070,
308
+ "<|14.12|>": 51071,
309
+ "<|14.14|>": 51072,
310
+ "<|14.16|>": 51073,
311
+ "<|14.18|>": 51074,
312
+ "<|14.20|>": 51075,
313
+ "<|14.22|>": 51076,
314
+ "<|14.24|>": 51077,
315
+ "<|14.26|>": 51078,
316
+ "<|14.28|>": 51079,
317
+ "<|14.30|>": 51080,
318
+ "<|14.32|>": 51081,
319
+ "<|14.34|>": 51082,
320
+ "<|14.36|>": 51083,
321
+ "<|14.38|>": 51084,
322
+ "<|14.40|>": 51085,
323
+ "<|14.42|>": 51086,
324
+ "<|14.44|>": 51087,
325
+ "<|14.46|>": 51088,
326
+ "<|14.48|>": 51089,
327
+ "<|14.50|>": 51090,
328
+ "<|14.52|>": 51091,
329
+ "<|14.54|>": 51092,
330
+ "<|14.56|>": 51093,
331
+ "<|14.58|>": 51094,
332
+ "<|14.60|>": 51095,
333
+ "<|14.62|>": 51096,
334
+ "<|14.64|>": 51097,
335
+ "<|14.66|>": 51098,
336
+ "<|14.68|>": 51099,
337
+ "<|14.70|>": 51100,
338
+ "<|14.72|>": 51101,
339
+ "<|14.74|>": 51102,
340
+ "<|14.76|>": 51103,
341
+ "<|14.78|>": 51104,
342
+ "<|14.80|>": 51105,
343
+ "<|14.82|>": 51106,
344
+ "<|14.84|>": 51107,
345
+ "<|14.86|>": 51108,
346
+ "<|14.88|>": 51109,
347
+ "<|14.90|>": 51110,
348
+ "<|14.92|>": 51111,
349
+ "<|14.94|>": 51112,
350
+ "<|14.96|>": 51113,
351
+ "<|14.98|>": 51114,
352
+ "<|15.00|>": 51115,
353
+ "<|15.02|>": 51116,
354
+ "<|15.04|>": 51117,
355
+ "<|15.06|>": 51118,
356
+ "<|15.08|>": 51119,
357
+ "<|15.10|>": 51120,
358
+ "<|15.12|>": 51121,
359
+ "<|15.14|>": 51122,
360
+ "<|15.16|>": 51123,
361
+ "<|15.18|>": 51124,
362
+ "<|15.20|>": 51125,
363
+ "<|15.22|>": 51126,
364
+ "<|15.24|>": 51127,
365
+ "<|15.26|>": 51128,
366
+ "<|15.28|>": 51129,
367
+ "<|15.30|>": 51130,
368
+ "<|15.32|>": 51131,
369
+ "<|15.34|>": 51132,
370
+ "<|15.36|>": 51133,
371
+ "<|15.38|>": 51134,
372
+ "<|15.40|>": 51135,
373
+ "<|15.42|>": 51136,
374
+ "<|15.44|>": 51137,
375
+ "<|15.46|>": 51138,
376
+ "<|15.48|>": 51139,
377
+ "<|15.50|>": 51140,
378
+ "<|15.52|>": 51141,
379
+ "<|15.54|>": 51142,
380
+ "<|15.56|>": 51143,
381
+ "<|15.58|>": 51144,
382
+ "<|15.60|>": 51145,
383
+ "<|15.62|>": 51146,
384
+ "<|15.64|>": 51147,
385
+ "<|15.66|>": 51148,
386
+ "<|15.68|>": 51149,
387
+ "<|15.70|>": 51150,
388
+ "<|15.72|>": 51151,
389
+ "<|15.74|>": 51152,
390
+ "<|15.76|>": 51153,
391
+ "<|15.78|>": 51154,
392
+ "<|15.80|>": 51155,
393
+ "<|15.82|>": 51156,
394
+ "<|15.84|>": 51157,
395
+ "<|15.86|>": 51158,
396
+ "<|15.88|>": 51159,
397
+ "<|15.90|>": 51160,
398
+ "<|15.92|>": 51161,
399
+ "<|15.94|>": 51162,
400
+ "<|15.96|>": 51163,
401
+ "<|15.98|>": 51164,
402
+ "<|16.00|>": 51165,
403
+ "<|16.02|>": 51166,
404
+ "<|16.04|>": 51167,
405
+ "<|16.06|>": 51168,
406
+ "<|16.08|>": 51169,
407
+ "<|16.10|>": 51170,
408
+ "<|16.12|>": 51171,
409
+ "<|16.14|>": 51172,
410
+ "<|16.16|>": 51173,
411
+ "<|16.18|>": 51174,
412
+ "<|16.20|>": 51175,
413
+ "<|16.22|>": 51176,
414
+ "<|16.24|>": 51177,
415
+ "<|16.26|>": 51178,
416
+ "<|16.28|>": 51179,
417
+ "<|16.30|>": 51180,
418
+ "<|16.32|>": 51181,
419
+ "<|16.34|>": 51182,
420
+ "<|16.36|>": 51183,
421
+ "<|16.38|>": 51184,
422
+ "<|16.40|>": 51185,
423
+ "<|16.42|>": 51186,
424
+ "<|16.44|>": 51187,
425
+ "<|16.46|>": 51188,
426
+ "<|16.48|>": 51189,
427
+ "<|16.50|>": 51190,
428
+ "<|16.52|>": 51191,
429
+ "<|16.54|>": 51192,
430
+ "<|16.56|>": 51193,
431
+ "<|16.58|>": 51194,
432
+ "<|16.60|>": 51195,
433
+ "<|16.62|>": 51196,
434
+ "<|16.64|>": 51197,
435
+ "<|16.66|>": 51198,
436
+ "<|16.68|>": 51199,
437
+ "<|16.70|>": 51200,
438
+ "<|16.72|>": 51201,
439
+ "<|16.74|>": 51202,
440
+ "<|16.76|>": 51203,
441
+ "<|16.78|>": 51204,
442
+ "<|16.80|>": 51205,
443
+ "<|16.82|>": 51206,
444
+ "<|16.84|>": 51207,
445
+ "<|16.86|>": 51208,
446
+ "<|16.88|>": 51209,
447
+ "<|16.90|>": 51210,
448
+ "<|16.92|>": 51211,
449
+ "<|16.94|>": 51212,
450
+ "<|16.96|>": 51213,
451
+ "<|16.98|>": 51214,
452
+ "<|17.00|>": 51215,
453
+ "<|17.02|>": 51216,
454
+ "<|17.04|>": 51217,
455
+ "<|17.06|>": 51218,
456
+ "<|17.08|>": 51219,
457
+ "<|17.10|>": 51220,
458
+ "<|17.12|>": 51221,
459
+ "<|17.14|>": 51222,
460
+ "<|17.16|>": 51223,
461
+ "<|17.18|>": 51224,
462
+ "<|17.20|>": 51225,
463
+ "<|17.22|>": 51226,
464
+ "<|17.24|>": 51227,
465
+ "<|17.26|>": 51228,
466
+ "<|17.28|>": 51229,
467
+ "<|17.30|>": 51230,
468
+ "<|17.32|>": 51231,
469
+ "<|17.34|>": 51232,
470
+ "<|17.36|>": 51233,
471
+ "<|17.38|>": 51234,
472
+ "<|17.40|>": 51235,
473
+ "<|17.42|>": 51236,
474
+ "<|17.44|>": 51237,
475
+ "<|17.46|>": 51238,
476
+ "<|17.48|>": 51239,
477
+ "<|17.50|>": 51240,
478
+ "<|17.52|>": 51241,
479
+ "<|17.54|>": 51242,
480
+ "<|17.56|>": 51243,
481
+ "<|17.58|>": 51244,
482
+ "<|17.60|>": 51245,
483
+ "<|17.62|>": 51246,
484
+ "<|17.64|>": 51247,
485
+ "<|17.66|>": 51248,
486
+ "<|17.68|>": 51249,
487
+ "<|17.70|>": 51250,
488
+ "<|17.72|>": 51251,
489
+ "<|17.74|>": 51252,
490
+ "<|17.76|>": 51253,
491
+ "<|17.78|>": 51254,
492
+ "<|17.80|>": 51255,
493
+ "<|17.82|>": 51256,
494
+ "<|17.84|>": 51257,
495
+ "<|17.86|>": 51258,
496
+ "<|17.88|>": 51259,
497
+ "<|17.90|>": 51260,
498
+ "<|17.92|>": 51261,
499
+ "<|17.94|>": 51262,
500
+ "<|17.96|>": 51263,
501
+ "<|17.98|>": 51264,
502
+ "<|18.00|>": 51265,
503
+ "<|18.02|>": 51266,
504
+ "<|18.04|>": 51267,
505
+ "<|18.06|>": 51268,
506
+ "<|18.08|>": 51269,
507
+ "<|18.10|>": 51270,
508
+ "<|18.12|>": 51271,
509
+ "<|18.14|>": 51272,
510
+ "<|18.16|>": 51273,
511
+ "<|18.18|>": 51274,
512
+ "<|18.20|>": 51275,
513
+ "<|18.22|>": 51276,
514
+ "<|18.24|>": 51277,
515
+ "<|18.26|>": 51278,
516
+ "<|18.28|>": 51279,
517
+ "<|18.30|>": 51280,
518
+ "<|18.32|>": 51281,
519
+ "<|18.34|>": 51282,
520
+ "<|18.36|>": 51283,
521
+ "<|18.38|>": 51284,
522
+ "<|18.40|>": 51285,
523
+ "<|18.42|>": 51286,
524
+ "<|18.44|>": 51287,
525
+ "<|18.46|>": 51288,
526
+ "<|18.48|>": 51289,
527
+ "<|18.50|>": 51290,
528
+ "<|18.52|>": 51291,
529
+ "<|18.54|>": 51292,
530
+ "<|18.56|>": 51293,
531
+ "<|18.58|>": 51294,
532
+ "<|18.60|>": 51295,
533
+ "<|18.62|>": 51296,
534
+ "<|18.64|>": 51297,
535
+ "<|18.66|>": 51298,
536
+ "<|18.68|>": 51299,
537
+ "<|18.70|>": 51300,
538
+ "<|18.72|>": 51301,
539
+ "<|18.74|>": 51302,
540
+ "<|18.76|>": 51303,
541
+ "<|18.78|>": 51304,
542
+ "<|18.80|>": 51305,
543
+ "<|18.82|>": 51306,
544
+ "<|18.84|>": 51307,
545
+ "<|18.86|>": 51308,
546
+ "<|18.88|>": 51309,
547
+ "<|18.90|>": 51310,
548
+ "<|18.92|>": 51311,
549
+ "<|18.94|>": 51312,
550
+ "<|18.96|>": 51313,
551
+ "<|18.98|>": 51314,
552
+ "<|19.00|>": 51315,
553
+ "<|19.02|>": 51316,
554
+ "<|19.04|>": 51317,
555
+ "<|19.06|>": 51318,
556
+ "<|19.08|>": 51319,
557
+ "<|19.10|>": 51320,
558
+ "<|19.12|>": 51321,
559
+ "<|19.14|>": 51322,
560
+ "<|19.16|>": 51323,
561
+ "<|19.18|>": 51324,
562
+ "<|19.20|>": 51325,
563
+ "<|19.22|>": 51326,
564
+ "<|19.24|>": 51327,
565
+ "<|19.26|>": 51328,
566
+ "<|19.28|>": 51329,
567
+ "<|19.30|>": 51330,
568
+ "<|19.32|>": 51331,
569
+ "<|19.34|>": 51332,
570
+ "<|19.36|>": 51333,
571
+ "<|19.38|>": 51334,
572
+ "<|19.40|>": 51335,
573
+ "<|19.42|>": 51336,
574
+ "<|19.44|>": 51337,
575
+ "<|19.46|>": 51338,
576
+ "<|19.48|>": 51339,
577
+ "<|19.50|>": 51340,
578
+ "<|19.52|>": 51341,
579
+ "<|19.54|>": 51342,
580
+ "<|19.56|>": 51343,
581
+ "<|19.58|>": 51344,
582
+ "<|19.60|>": 51345,
583
+ "<|19.62|>": 51346,
584
+ "<|19.64|>": 51347,
585
+ "<|19.66|>": 51348,
586
+ "<|19.68|>": 51349,
587
+ "<|19.70|>": 51350,
588
+ "<|19.72|>": 51351,
589
+ "<|19.74|>": 51352,
590
+ "<|19.76|>": 51353,
591
+ "<|19.78|>": 51354,
592
+ "<|19.80|>": 51355,
593
+ "<|19.82|>": 51356,
594
+ "<|19.84|>": 51357,
595
+ "<|19.86|>": 51358,
596
+ "<|19.88|>": 51359,
597
+ "<|19.90|>": 51360,
598
+ "<|19.92|>": 51361,
599
+ "<|19.94|>": 51362,
600
+ "<|19.96|>": 51363,
601
+ "<|19.98|>": 51364,
602
+ "<|2.00|>": 50465,
603
+ "<|2.02|>": 50466,
604
+ "<|2.04|>": 50467,
605
+ "<|2.06|>": 50468,
606
+ "<|2.08|>": 50469,
607
+ "<|2.10|>": 50470,
608
+ "<|2.12|>": 50471,
609
+ "<|2.14|>": 50472,
610
+ "<|2.16|>": 50473,
611
+ "<|2.18|>": 50474,
612
+ "<|2.20|>": 50475,
613
+ "<|2.22|>": 50476,
614
+ "<|2.24|>": 50477,
615
+ "<|2.26|>": 50478,
616
+ "<|2.28|>": 50479,
617
+ "<|2.30|>": 50480,
618
+ "<|2.32|>": 50481,
619
+ "<|2.34|>": 50482,
620
+ "<|2.36|>": 50483,
621
+ "<|2.38|>": 50484,
622
+ "<|2.40|>": 50485,
623
+ "<|2.42|>": 50486,
624
+ "<|2.44|>": 50487,
625
+ "<|2.46|>": 50488,
626
+ "<|2.48|>": 50489,
627
+ "<|2.50|>": 50490,
628
+ "<|2.52|>": 50491,
629
+ "<|2.54|>": 50492,
630
+ "<|2.56|>": 50493,
631
+ "<|2.58|>": 50494,
632
+ "<|2.60|>": 50495,
633
+ "<|2.62|>": 50496,
634
+ "<|2.64|>": 50497,
635
+ "<|2.66|>": 50498,
636
+ "<|2.68|>": 50499,
637
+ "<|2.70|>": 50500,
638
+ "<|2.72|>": 50501,
639
+ "<|2.74|>": 50502,
640
+ "<|2.76|>": 50503,
641
+ "<|2.78|>": 50504,
642
+ "<|2.80|>": 50505,
643
+ "<|2.82|>": 50506,
644
+ "<|2.84|>": 50507,
645
+ "<|2.86|>": 50508,
646
+ "<|2.88|>": 50509,
647
+ "<|2.90|>": 50510,
648
+ "<|2.92|>": 50511,
649
+ "<|2.94|>": 50512,
650
+ "<|2.96|>": 50513,
651
+ "<|2.98|>": 50514,
652
+ "<|20.00|>": 51365,
653
+ "<|20.02|>": 51366,
654
+ "<|20.04|>": 51367,
655
+ "<|20.06|>": 51368,
656
+ "<|20.08|>": 51369,
657
+ "<|20.10|>": 51370,
658
+ "<|20.12|>": 51371,
659
+ "<|20.14|>": 51372,
660
+ "<|20.16|>": 51373,
661
+ "<|20.18|>": 51374,
662
+ "<|20.20|>": 51375,
663
+ "<|20.22|>": 51376,
664
+ "<|20.24|>": 51377,
665
+ "<|20.26|>": 51378,
666
+ "<|20.28|>": 51379,
667
+ "<|20.30|>": 51380,
668
+ "<|20.32|>": 51381,
669
+ "<|20.34|>": 51382,
670
+ "<|20.36|>": 51383,
671
+ "<|20.38|>": 51384,
672
+ "<|20.40|>": 51385,
673
+ "<|20.42|>": 51386,
674
+ "<|20.44|>": 51387,
675
+ "<|20.46|>": 51388,
676
+ "<|20.48|>": 51389,
677
+ "<|20.50|>": 51390,
678
+ "<|20.52|>": 51391,
679
+ "<|20.54|>": 51392,
680
+ "<|20.56|>": 51393,
681
+ "<|20.58|>": 51394,
682
+ "<|20.60|>": 51395,
683
+ "<|20.62|>": 51396,
684
+ "<|20.64|>": 51397,
685
+ "<|20.66|>": 51398,
686
+ "<|20.68|>": 51399,
687
+ "<|20.70|>": 51400,
688
+ "<|20.72|>": 51401,
689
+ "<|20.74|>": 51402,
690
+ "<|20.76|>": 51403,
691
+ "<|20.78|>": 51404,
692
+ "<|20.80|>": 51405,
693
+ "<|20.82|>": 51406,
694
+ "<|20.84|>": 51407,
695
+ "<|20.86|>": 51408,
696
+ "<|20.88|>": 51409,
697
+ "<|20.90|>": 51410,
698
+ "<|20.92|>": 51411,
699
+ "<|20.94|>": 51412,
700
+ "<|20.96|>": 51413,
701
+ "<|20.98|>": 51414,
702
+ "<|21.00|>": 51415,
703
+ "<|21.02|>": 51416,
704
+ "<|21.04|>": 51417,
705
+ "<|21.06|>": 51418,
706
+ "<|21.08|>": 51419,
707
+ "<|21.10|>": 51420,
708
+ "<|21.12|>": 51421,
709
+ "<|21.14|>": 51422,
710
+ "<|21.16|>": 51423,
711
+ "<|21.18|>": 51424,
712
+ "<|21.20|>": 51425,
713
+ "<|21.22|>": 51426,
714
+ "<|21.24|>": 51427,
715
+ "<|21.26|>": 51428,
716
+ "<|21.28|>": 51429,
717
+ "<|21.30|>": 51430,
718
+ "<|21.32|>": 51431,
719
+ "<|21.34|>": 51432,
720
+ "<|21.36|>": 51433,
721
+ "<|21.38|>": 51434,
722
+ "<|21.40|>": 51435,
723
+ "<|21.42|>": 51436,
724
+ "<|21.44|>": 51437,
725
+ "<|21.46|>": 51438,
726
+ "<|21.48|>": 51439,
727
+ "<|21.50|>": 51440,
728
+ "<|21.52|>": 51441,
729
+ "<|21.54|>": 51442,
730
+ "<|21.56|>": 51443,
731
+ "<|21.58|>": 51444,
732
+ "<|21.60|>": 51445,
733
+ "<|21.62|>": 51446,
734
+ "<|21.64|>": 51447,
735
+ "<|21.66|>": 51448,
736
+ "<|21.68|>": 51449,
737
+ "<|21.70|>": 51450,
738
+ "<|21.72|>": 51451,
739
+ "<|21.74|>": 51452,
740
+ "<|21.76|>": 51453,
741
+ "<|21.78|>": 51454,
742
+ "<|21.80|>": 51455,
743
+ "<|21.82|>": 51456,
744
+ "<|21.84|>": 51457,
745
+ "<|21.86|>": 51458,
746
+ "<|21.88|>": 51459,
747
+ "<|21.90|>": 51460,
748
+ "<|21.92|>": 51461,
749
+ "<|21.94|>": 51462,
750
+ "<|21.96|>": 51463,
751
+ "<|21.98|>": 51464,
752
+ "<|22.00|>": 51465,
753
+ "<|22.02|>": 51466,
754
+ "<|22.04|>": 51467,
755
+ "<|22.06|>": 51468,
756
+ "<|22.08|>": 51469,
757
+ "<|22.10|>": 51470,
758
+ "<|22.12|>": 51471,
759
+ "<|22.14|>": 51472,
760
+ "<|22.16|>": 51473,
761
+ "<|22.18|>": 51474,
762
+ "<|22.20|>": 51475,
763
+ "<|22.22|>": 51476,
764
+ "<|22.24|>": 51477,
765
+ "<|22.26|>": 51478,
766
+ "<|22.28|>": 51479,
767
+ "<|22.30|>": 51480,
768
+ "<|22.32|>": 51481,
769
+ "<|22.34|>": 51482,
770
+ "<|22.36|>": 51483,
771
+ "<|22.38|>": 51484,
772
+ "<|22.40|>": 51485,
773
+ "<|22.42|>": 51486,
774
+ "<|22.44|>": 51487,
775
+ "<|22.46|>": 51488,
776
+ "<|22.48|>": 51489,
777
+ "<|22.50|>": 51490,
778
+ "<|22.52|>": 51491,
779
+ "<|22.54|>": 51492,
780
+ "<|22.56|>": 51493,
781
+ "<|22.58|>": 51494,
782
+ "<|22.60|>": 51495,
783
+ "<|22.62|>": 51496,
784
+ "<|22.64|>": 51497,
785
+ "<|22.66|>": 51498,
786
+ "<|22.68|>": 51499,
787
+ "<|22.70|>": 51500,
788
+ "<|22.72|>": 51501,
789
+ "<|22.74|>": 51502,
790
+ "<|22.76|>": 51503,
791
+ "<|22.78|>": 51504,
792
+ "<|22.80|>": 51505,
793
+ "<|22.82|>": 51506,
794
+ "<|22.84|>": 51507,
795
+ "<|22.86|>": 51508,
796
+ "<|22.88|>": 51509,
797
+ "<|22.90|>": 51510,
798
+ "<|22.92|>": 51511,
799
+ "<|22.94|>": 51512,
800
+ "<|22.96|>": 51513,
801
+ "<|22.98|>": 51514,
802
+ "<|23.00|>": 51515,
803
+ "<|23.02|>": 51516,
804
+ "<|23.04|>": 51517,
805
+ "<|23.06|>": 51518,
806
+ "<|23.08|>": 51519,
807
+ "<|23.10|>": 51520,
808
+ "<|23.12|>": 51521,
809
+ "<|23.14|>": 51522,
810
+ "<|23.16|>": 51523,
811
+ "<|23.18|>": 51524,
812
+ "<|23.20|>": 51525,
813
+ "<|23.22|>": 51526,
814
+ "<|23.24|>": 51527,
815
+ "<|23.26|>": 51528,
816
+ "<|23.28|>": 51529,
817
+ "<|23.30|>": 51530,
818
+ "<|23.32|>": 51531,
819
+ "<|23.34|>": 51532,
820
+ "<|23.36|>": 51533,
821
+ "<|23.38|>": 51534,
822
+ "<|23.40|>": 51535,
823
+ "<|23.42|>": 51536,
824
+ "<|23.44|>": 51537,
825
+ "<|23.46|>": 51538,
826
+ "<|23.48|>": 51539,
827
+ "<|23.50|>": 51540,
828
+ "<|23.52|>": 51541,
829
+ "<|23.54|>": 51542,
830
+ "<|23.56|>": 51543,
831
+ "<|23.58|>": 51544,
832
+ "<|23.60|>": 51545,
833
+ "<|23.62|>": 51546,
834
+ "<|23.64|>": 51547,
835
+ "<|23.66|>": 51548,
836
+ "<|23.68|>": 51549,
837
+ "<|23.70|>": 51550,
838
+ "<|23.72|>": 51551,
839
+ "<|23.74|>": 51552,
840
+ "<|23.76|>": 51553,
841
+ "<|23.78|>": 51554,
842
+ "<|23.80|>": 51555,
843
+ "<|23.82|>": 51556,
844
+ "<|23.84|>": 51557,
845
+ "<|23.86|>": 51558,
846
+ "<|23.88|>": 51559,
847
+ "<|23.90|>": 51560,
848
+ "<|23.92|>": 51561,
849
+ "<|23.94|>": 51562,
850
+ "<|23.96|>": 51563,
851
+ "<|23.98|>": 51564,
852
+ "<|24.00|>": 51565,
853
+ "<|24.02|>": 51566,
854
+ "<|24.04|>": 51567,
855
+ "<|24.06|>": 51568,
856
+ "<|24.08|>": 51569,
857
+ "<|24.10|>": 51570,
858
+ "<|24.12|>": 51571,
859
+ "<|24.14|>": 51572,
860
+ "<|24.16|>": 51573,
861
+ "<|24.18|>": 51574,
862
+ "<|24.20|>": 51575,
863
+ "<|24.22|>": 51576,
864
+ "<|24.24|>": 51577,
865
+ "<|24.26|>": 51578,
866
+ "<|24.28|>": 51579,
867
+ "<|24.30|>": 51580,
868
+ "<|24.32|>": 51581,
869
+ "<|24.34|>": 51582,
870
+ "<|24.36|>": 51583,
871
+ "<|24.38|>": 51584,
872
+ "<|24.40|>": 51585,
873
+ "<|24.42|>": 51586,
874
+ "<|24.44|>": 51587,
875
+ "<|24.46|>": 51588,
876
+ "<|24.48|>": 51589,
877
+ "<|24.50|>": 51590,
878
+ "<|24.52|>": 51591,
879
+ "<|24.54|>": 51592,
880
+ "<|24.56|>": 51593,
881
+ "<|24.58|>": 51594,
882
+ "<|24.60|>": 51595,
883
+ "<|24.62|>": 51596,
884
+ "<|24.64|>": 51597,
885
+ "<|24.66|>": 51598,
886
+ "<|24.68|>": 51599,
887
+ "<|24.70|>": 51600,
888
+ "<|24.72|>": 51601,
889
+ "<|24.74|>": 51602,
890
+ "<|24.76|>": 51603,
891
+ "<|24.78|>": 51604,
892
+ "<|24.80|>": 51605,
893
+ "<|24.82|>": 51606,
894
+ "<|24.84|>": 51607,
895
+ "<|24.86|>": 51608,
896
+ "<|24.88|>": 51609,
897
+ "<|24.90|>": 51610,
898
+ "<|24.92|>": 51611,
899
+ "<|24.94|>": 51612,
900
+ "<|24.96|>": 51613,
901
+ "<|24.98|>": 51614,
902
+ "<|25.00|>": 51615,
903
+ "<|25.02|>": 51616,
904
+ "<|25.04|>": 51617,
905
+ "<|25.06|>": 51618,
906
+ "<|25.08|>": 51619,
907
+ "<|25.10|>": 51620,
908
+ "<|25.12|>": 51621,
909
+ "<|25.14|>": 51622,
910
+ "<|25.16|>": 51623,
911
+ "<|25.18|>": 51624,
912
+ "<|25.20|>": 51625,
913
+ "<|25.22|>": 51626,
914
+ "<|25.24|>": 51627,
915
+ "<|25.26|>": 51628,
916
+ "<|25.28|>": 51629,
917
+ "<|25.30|>": 51630,
918
+ "<|25.32|>": 51631,
919
+ "<|25.34|>": 51632,
920
+ "<|25.36|>": 51633,
921
+ "<|25.38|>": 51634,
922
+ "<|25.40|>": 51635,
923
+ "<|25.42|>": 51636,
924
+ "<|25.44|>": 51637,
925
+ "<|25.46|>": 51638,
926
+ "<|25.48|>": 51639,
927
+ "<|25.50|>": 51640,
928
+ "<|25.52|>": 51641,
929
+ "<|25.54|>": 51642,
930
+ "<|25.56|>": 51643,
931
+ "<|25.58|>": 51644,
932
+ "<|25.60|>": 51645,
933
+ "<|25.62|>": 51646,
934
+ "<|25.64|>": 51647,
935
+ "<|25.66|>": 51648,
936
+ "<|25.68|>": 51649,
937
+ "<|25.70|>": 51650,
938
+ "<|25.72|>": 51651,
939
+ "<|25.74|>": 51652,
940
+ "<|25.76|>": 51653,
941
+ "<|25.78|>": 51654,
942
+ "<|25.80|>": 51655,
943
+ "<|25.82|>": 51656,
944
+ "<|25.84|>": 51657,
945
+ "<|25.86|>": 51658,
946
+ "<|25.88|>": 51659,
947
+ "<|25.90|>": 51660,
948
+ "<|25.92|>": 51661,
949
+ "<|25.94|>": 51662,
950
+ "<|25.96|>": 51663,
951
+ "<|25.98|>": 51664,
952
+ "<|26.00|>": 51665,
953
+ "<|26.02|>": 51666,
954
+ "<|26.04|>": 51667,
955
+ "<|26.06|>": 51668,
956
+ "<|26.08|>": 51669,
957
+ "<|26.10|>": 51670,
958
+ "<|26.12|>": 51671,
959
+ "<|26.14|>": 51672,
960
+ "<|26.16|>": 51673,
961
+ "<|26.18|>": 51674,
962
+ "<|26.20|>": 51675,
963
+ "<|26.22|>": 51676,
964
+ "<|26.24|>": 51677,
965
+ "<|26.26|>": 51678,
966
+ "<|26.28|>": 51679,
967
+ "<|26.30|>": 51680,
968
+ "<|26.32|>": 51681,
969
+ "<|26.34|>": 51682,
970
+ "<|26.36|>": 51683,
971
+ "<|26.38|>": 51684,
972
+ "<|26.40|>": 51685,
973
+ "<|26.42|>": 51686,
974
+ "<|26.44|>": 51687,
975
+ "<|26.46|>": 51688,
976
+ "<|26.48|>": 51689,
977
+ "<|26.50|>": 51690,
978
+ "<|26.52|>": 51691,
979
+ "<|26.54|>": 51692,
980
+ "<|26.56|>": 51693,
981
+ "<|26.58|>": 51694,
982
+ "<|26.60|>": 51695,
983
+ "<|26.62|>": 51696,
984
+ "<|26.64|>": 51697,
985
+ "<|26.66|>": 51698,
986
+ "<|26.68|>": 51699,
987
+ "<|26.70|>": 51700,
988
+ "<|26.72|>": 51701,
989
+ "<|26.74|>": 51702,
990
+ "<|26.76|>": 51703,
991
+ "<|26.78|>": 51704,
992
+ "<|26.80|>": 51705,
993
+ "<|26.82|>": 51706,
994
+ "<|26.84|>": 51707,
995
+ "<|26.86|>": 51708,
996
+ "<|26.88|>": 51709,
997
+ "<|26.90|>": 51710,
998
+ "<|26.92|>": 51711,
999
+ "<|26.94|>": 51712,
1000
+ "<|26.96|>": 51713,
1001
+ "<|26.98|>": 51714,
1002
+ "<|27.00|>": 51715,
1003
+ "<|27.02|>": 51716,
1004
+ "<|27.04|>": 51717,
1005
+ "<|27.06|>": 51718,
1006
+ "<|27.08|>": 51719,
1007
+ "<|27.10|>": 51720,
1008
+ "<|27.12|>": 51721,
1009
+ "<|27.14|>": 51722,
1010
+ "<|27.16|>": 51723,
1011
+ "<|27.18|>": 51724,
1012
+ "<|27.20|>": 51725,
1013
+ "<|27.22|>": 51726,
1014
+ "<|27.24|>": 51727,
1015
+ "<|27.26|>": 51728,
1016
+ "<|27.28|>": 51729,
1017
+ "<|27.30|>": 51730,
1018
+ "<|27.32|>": 51731,
1019
+ "<|27.34|>": 51732,
1020
+ "<|27.36|>": 51733,
1021
+ "<|27.38|>": 51734,
1022
+ "<|27.40|>": 51735,
1023
+ "<|27.42|>": 51736,
1024
+ "<|27.44|>": 51737,
1025
+ "<|27.46|>": 51738,
1026
+ "<|27.48|>": 51739,
1027
+ "<|27.50|>": 51740,
1028
+ "<|27.52|>": 51741,
1029
+ "<|27.54|>": 51742,
1030
+ "<|27.56|>": 51743,
1031
+ "<|27.58|>": 51744,
1032
+ "<|27.60|>": 51745,
1033
+ "<|27.62|>": 51746,
1034
+ "<|27.64|>": 51747,
1035
+ "<|27.66|>": 51748,
1036
+ "<|27.68|>": 51749,
1037
+ "<|27.70|>": 51750,
1038
+ "<|27.72|>": 51751,
1039
+ "<|27.74|>": 51752,
1040
+ "<|27.76|>": 51753,
1041
+ "<|27.78|>": 51754,
1042
+ "<|27.80|>": 51755,
1043
+ "<|27.82|>": 51756,
1044
+ "<|27.84|>": 51757,
1045
+ "<|27.86|>": 51758,
1046
+ "<|27.88|>": 51759,
1047
+ "<|27.90|>": 51760,
1048
+ "<|27.92|>": 51761,
1049
+ "<|27.94|>": 51762,
1050
+ "<|27.96|>": 51763,
1051
+ "<|27.98|>": 51764,
1052
+ "<|28.00|>": 51765,
1053
+ "<|28.02|>": 51766,
1054
+ "<|28.04|>": 51767,
1055
+ "<|28.06|>": 51768,
1056
+ "<|28.08|>": 51769,
1057
+ "<|28.10|>": 51770,
1058
+ "<|28.12|>": 51771,
1059
+ "<|28.14|>": 51772,
1060
+ "<|28.16|>": 51773,
1061
+ "<|28.18|>": 51774,
1062
+ "<|28.20|>": 51775,
1063
+ "<|28.22|>": 51776,
1064
+ "<|28.24|>": 51777,
1065
+ "<|28.26|>": 51778,
1066
+ "<|28.28|>": 51779,
1067
+ "<|28.30|>": 51780,
1068
+ "<|28.32|>": 51781,
1069
+ "<|28.34|>": 51782,
1070
+ "<|28.36|>": 51783,
1071
+ "<|28.38|>": 51784,
1072
+ "<|28.40|>": 51785,
1073
+ "<|28.42|>": 51786,
1074
+ "<|28.44|>": 51787,
1075
+ "<|28.46|>": 51788,
1076
+ "<|28.48|>": 51789,
1077
+ "<|28.50|>": 51790,
1078
+ "<|28.52|>": 51791,
1079
+ "<|28.54|>": 51792,
1080
+ "<|28.56|>": 51793,
1081
+ "<|28.58|>": 51794,
1082
+ "<|28.60|>": 51795,
1083
+ "<|28.62|>": 51796,
1084
+ "<|28.64|>": 51797,
1085
+ "<|28.66|>": 51798,
1086
+ "<|28.68|>": 51799,
1087
+ "<|28.70|>": 51800,
1088
+ "<|28.72|>": 51801,
1089
+ "<|28.74|>": 51802,
1090
+ "<|28.76|>": 51803,
1091
+ "<|28.78|>": 51804,
1092
+ "<|28.80|>": 51805,
1093
+ "<|28.82|>": 51806,
1094
+ "<|28.84|>": 51807,
1095
+ "<|28.86|>": 51808,
1096
+ "<|28.88|>": 51809,
1097
+ "<|28.90|>": 51810,
1098
+ "<|28.92|>": 51811,
1099
+ "<|28.94|>": 51812,
1100
+ "<|28.96|>": 51813,
1101
+ "<|28.98|>": 51814,
1102
+ "<|29.00|>": 51815,
1103
+ "<|29.02|>": 51816,
1104
+ "<|29.04|>": 51817,
1105
+ "<|29.06|>": 51818,
1106
+ "<|29.08|>": 51819,
1107
+ "<|29.10|>": 51820,
1108
+ "<|29.12|>": 51821,
1109
+ "<|29.14|>": 51822,
1110
+ "<|29.16|>": 51823,
1111
+ "<|29.18|>": 51824,
1112
+ "<|29.20|>": 51825,
1113
+ "<|29.22|>": 51826,
1114
+ "<|29.24|>": 51827,
1115
+ "<|29.26|>": 51828,
1116
+ "<|29.28|>": 51829,
1117
+ "<|29.30|>": 51830,
1118
+ "<|29.32|>": 51831,
1119
+ "<|29.34|>": 51832,
1120
+ "<|29.36|>": 51833,
1121
+ "<|29.38|>": 51834,
1122
+ "<|29.40|>": 51835,
1123
+ "<|29.42|>": 51836,
1124
+ "<|29.44|>": 51837,
1125
+ "<|29.46|>": 51838,
1126
+ "<|29.48|>": 51839,
1127
+ "<|29.50|>": 51840,
1128
+ "<|29.52|>": 51841,
1129
+ "<|29.54|>": 51842,
1130
+ "<|29.56|>": 51843,
1131
+ "<|29.58|>": 51844,
1132
+ "<|29.60|>": 51845,
1133
+ "<|29.62|>": 51846,
1134
+ "<|29.64|>": 51847,
1135
+ "<|29.66|>": 51848,
1136
+ "<|29.68|>": 51849,
1137
+ "<|29.70|>": 51850,
1138
+ "<|29.72|>": 51851,
1139
+ "<|29.74|>": 51852,
1140
+ "<|29.76|>": 51853,
1141
+ "<|29.78|>": 51854,
1142
+ "<|29.80|>": 51855,
1143
+ "<|29.82|>": 51856,
1144
+ "<|29.84|>": 51857,
1145
+ "<|29.86|>": 51858,
1146
+ "<|29.88|>": 51859,
1147
+ "<|29.90|>": 51860,
1148
+ "<|29.92|>": 51861,
1149
+ "<|29.94|>": 51862,
1150
+ "<|29.96|>": 51863,
1151
+ "<|29.98|>": 51864,
1152
+ "<|3.00|>": 50515,
1153
+ "<|3.02|>": 50516,
1154
+ "<|3.04|>": 50517,
1155
+ "<|3.06|>": 50518,
1156
+ "<|3.08|>": 50519,
1157
+ "<|3.10|>": 50520,
1158
+ "<|3.12|>": 50521,
1159
+ "<|3.14|>": 50522,
1160
+ "<|3.16|>": 50523,
1161
+ "<|3.18|>": 50524,
1162
+ "<|3.20|>": 50525,
1163
+ "<|3.22|>": 50526,
1164
+ "<|3.24|>": 50527,
1165
+ "<|3.26|>": 50528,
1166
+ "<|3.28|>": 50529,
1167
+ "<|3.30|>": 50530,
1168
+ "<|3.32|>": 50531,
1169
+ "<|3.34|>": 50532,
1170
+ "<|3.36|>": 50533,
1171
+ "<|3.38|>": 50534,
1172
+ "<|3.40|>": 50535,
1173
+ "<|3.42|>": 50536,
1174
+ "<|3.44|>": 50537,
1175
+ "<|3.46|>": 50538,
1176
+ "<|3.48|>": 50539,
1177
+ "<|3.50|>": 50540,
1178
+ "<|3.52|>": 50541,
1179
+ "<|3.54|>": 50542,
1180
+ "<|3.56|>": 50543,
1181
+ "<|3.58|>": 50544,
1182
+ "<|3.60|>": 50545,
1183
+ "<|3.62|>": 50546,
1184
+ "<|3.64|>": 50547,
1185
+ "<|3.66|>": 50548,
1186
+ "<|3.68|>": 50549,
1187
+ "<|3.70|>": 50550,
1188
+ "<|3.72|>": 50551,
1189
+ "<|3.74|>": 50552,
1190
+ "<|3.76|>": 50553,
1191
+ "<|3.78|>": 50554,
1192
+ "<|3.80|>": 50555,
1193
+ "<|3.82|>": 50556,
1194
+ "<|3.84|>": 50557,
1195
+ "<|3.86|>": 50558,
1196
+ "<|3.88|>": 50559,
1197
+ "<|3.90|>": 50560,
1198
+ "<|3.92|>": 50561,
1199
+ "<|3.94|>": 50562,
1200
+ "<|3.96|>": 50563,
1201
+ "<|3.98|>": 50564,
1202
+ "<|30.00|>": 51865,
1203
+ "<|4.00|>": 50565,
1204
+ "<|4.02|>": 50566,
1205
+ "<|4.04|>": 50567,
1206
+ "<|4.06|>": 50568,
1207
+ "<|4.08|>": 50569,
1208
+ "<|4.10|>": 50570,
1209
+ "<|4.12|>": 50571,
1210
+ "<|4.14|>": 50572,
1211
+ "<|4.16|>": 50573,
1212
+ "<|4.18|>": 50574,
1213
+ "<|4.20|>": 50575,
1214
+ "<|4.22|>": 50576,
1215
+ "<|4.24|>": 50577,
1216
+ "<|4.26|>": 50578,
1217
+ "<|4.28|>": 50579,
1218
+ "<|4.30|>": 50580,
1219
+ "<|4.32|>": 50581,
1220
+ "<|4.34|>": 50582,
1221
+ "<|4.36|>": 50583,
1222
+ "<|4.38|>": 50584,
1223
+ "<|4.40|>": 50585,
1224
+ "<|4.42|>": 50586,
1225
+ "<|4.44|>": 50587,
1226
+ "<|4.46|>": 50588,
1227
+ "<|4.48|>": 50589,
1228
+ "<|4.50|>": 50590,
1229
+ "<|4.52|>": 50591,
1230
+ "<|4.54|>": 50592,
1231
+ "<|4.56|>": 50593,
1232
+ "<|4.58|>": 50594,
1233
+ "<|4.60|>": 50595,
1234
+ "<|4.62|>": 50596,
1235
+ "<|4.64|>": 50597,
1236
+ "<|4.66|>": 50598,
1237
+ "<|4.68|>": 50599,
1238
+ "<|4.70|>": 50600,
1239
+ "<|4.72|>": 50601,
1240
+ "<|4.74|>": 50602,
1241
+ "<|4.76|>": 50603,
1242
+ "<|4.78|>": 50604,
1243
+ "<|4.80|>": 50605,
1244
+ "<|4.82|>": 50606,
1245
+ "<|4.84|>": 50607,
1246
+ "<|4.86|>": 50608,
1247
+ "<|4.88|>": 50609,
1248
+ "<|4.90|>": 50610,
1249
+ "<|4.92|>": 50611,
1250
+ "<|4.94|>": 50612,
1251
+ "<|4.96|>": 50613,
1252
+ "<|4.98|>": 50614,
1253
+ "<|5.00|>": 50615,
1254
+ "<|5.02|>": 50616,
1255
+ "<|5.04|>": 50617,
1256
+ "<|5.06|>": 50618,
1257
+ "<|5.08|>": 50619,
1258
+ "<|5.10|>": 50620,
1259
+ "<|5.12|>": 50621,
1260
+ "<|5.14|>": 50622,
1261
+ "<|5.16|>": 50623,
1262
+ "<|5.18|>": 50624,
1263
+ "<|5.20|>": 50625,
1264
+ "<|5.22|>": 50626,
1265
+ "<|5.24|>": 50627,
1266
+ "<|5.26|>": 50628,
1267
+ "<|5.28|>": 50629,
1268
+ "<|5.30|>": 50630,
1269
+ "<|5.32|>": 50631,
1270
+ "<|5.34|>": 50632,
1271
+ "<|5.36|>": 50633,
1272
+ "<|5.38|>": 50634,
1273
+ "<|5.40|>": 50635,
1274
+ "<|5.42|>": 50636,
1275
+ "<|5.44|>": 50637,
1276
+ "<|5.46|>": 50638,
1277
+ "<|5.48|>": 50639,
1278
+ "<|5.50|>": 50640,
1279
+ "<|5.52|>": 50641,
1280
+ "<|5.54|>": 50642,
1281
+ "<|5.56|>": 50643,
1282
+ "<|5.58|>": 50644,
1283
+ "<|5.60|>": 50645,
1284
+ "<|5.62|>": 50646,
1285
+ "<|5.64|>": 50647,
1286
+ "<|5.66|>": 50648,
1287
+ "<|5.68|>": 50649,
1288
+ "<|5.70|>": 50650,
1289
+ "<|5.72|>": 50651,
1290
+ "<|5.74|>": 50652,
1291
+ "<|5.76|>": 50653,
1292
+ "<|5.78|>": 50654,
1293
+ "<|5.80|>": 50655,
1294
+ "<|5.82|>": 50656,
1295
+ "<|5.84|>": 50657,
1296
+ "<|5.86|>": 50658,
1297
+ "<|5.88|>": 50659,
1298
+ "<|5.90|>": 50660,
1299
+ "<|5.92|>": 50661,
1300
+ "<|5.94|>": 50662,
1301
+ "<|5.96|>": 50663,
1302
+ "<|5.98|>": 50664,
1303
+ "<|6.00|>": 50665,
1304
+ "<|6.02|>": 50666,
1305
+ "<|6.04|>": 50667,
1306
+ "<|6.06|>": 50668,
1307
+ "<|6.08|>": 50669,
1308
+ "<|6.10|>": 50670,
1309
+ "<|6.12|>": 50671,
1310
+ "<|6.14|>": 50672,
1311
+ "<|6.16|>": 50673,
1312
+ "<|6.18|>": 50674,
1313
+ "<|6.20|>": 50675,
1314
+ "<|6.22|>": 50676,
1315
+ "<|6.24|>": 50677,
1316
+ "<|6.26|>": 50678,
1317
+ "<|6.28|>": 50679,
1318
+ "<|6.30|>": 50680,
1319
+ "<|6.32|>": 50681,
1320
+ "<|6.34|>": 50682,
1321
+ "<|6.36|>": 50683,
1322
+ "<|6.38|>": 50684,
1323
+ "<|6.40|>": 50685,
1324
+ "<|6.42|>": 50686,
1325
+ "<|6.44|>": 50687,
1326
+ "<|6.46|>": 50688,
1327
+ "<|6.48|>": 50689,
1328
+ "<|6.50|>": 50690,
1329
+ "<|6.52|>": 50691,
1330
+ "<|6.54|>": 50692,
1331
+ "<|6.56|>": 50693,
1332
+ "<|6.58|>": 50694,
1333
+ "<|6.60|>": 50695,
1334
+ "<|6.62|>": 50696,
1335
+ "<|6.64|>": 50697,
1336
+ "<|6.66|>": 50698,
1337
+ "<|6.68|>": 50699,
1338
+ "<|6.70|>": 50700,
1339
+ "<|6.72|>": 50701,
1340
+ "<|6.74|>": 50702,
1341
+ "<|6.76|>": 50703,
1342
+ "<|6.78|>": 50704,
1343
+ "<|6.80|>": 50705,
1344
+ "<|6.82|>": 50706,
1345
+ "<|6.84|>": 50707,
1346
+ "<|6.86|>": 50708,
1347
+ "<|6.88|>": 50709,
1348
+ "<|6.90|>": 50710,
1349
+ "<|6.92|>": 50711,
1350
+ "<|6.94|>": 50712,
1351
+ "<|6.96|>": 50713,
1352
+ "<|6.98|>": 50714,
1353
+ "<|7.00|>": 50715,
1354
+ "<|7.02|>": 50716,
1355
+ "<|7.04|>": 50717,
1356
+ "<|7.06|>": 50718,
1357
+ "<|7.08|>": 50719,
1358
+ "<|7.10|>": 50720,
1359
+ "<|7.12|>": 50721,
1360
+ "<|7.14|>": 50722,
1361
+ "<|7.16|>": 50723,
1362
+ "<|7.18|>": 50724,
1363
+ "<|7.20|>": 50725,
1364
+ "<|7.22|>": 50726,
1365
+ "<|7.24|>": 50727,
1366
+ "<|7.26|>": 50728,
1367
+ "<|7.28|>": 50729,
1368
+ "<|7.30|>": 50730,
1369
+ "<|7.32|>": 50731,
1370
+ "<|7.34|>": 50732,
1371
+ "<|7.36|>": 50733,
1372
+ "<|7.38|>": 50734,
1373
+ "<|7.40|>": 50735,
1374
+ "<|7.42|>": 50736,
1375
+ "<|7.44|>": 50737,
1376
+ "<|7.46|>": 50738,
1377
+ "<|7.48|>": 50739,
1378
+ "<|7.50|>": 50740,
1379
+ "<|7.52|>": 50741,
1380
+ "<|7.54|>": 50742,
1381
+ "<|7.56|>": 50743,
1382
+ "<|7.58|>": 50744,
1383
+ "<|7.60|>": 50745,
1384
+ "<|7.62|>": 50746,
1385
+ "<|7.64|>": 50747,
1386
+ "<|7.66|>": 50748,
1387
+ "<|7.68|>": 50749,
1388
+ "<|7.70|>": 50750,
1389
+ "<|7.72|>": 50751,
1390
+ "<|7.74|>": 50752,
1391
+ "<|7.76|>": 50753,
1392
+ "<|7.78|>": 50754,
1393
+ "<|7.80|>": 50755,
1394
+ "<|7.82|>": 50756,
1395
+ "<|7.84|>": 50757,
1396
+ "<|7.86|>": 50758,
1397
+ "<|7.88|>": 50759,
1398
+ "<|7.90|>": 50760,
1399
+ "<|7.92|>": 50761,
1400
+ "<|7.94|>": 50762,
1401
+ "<|7.96|>": 50763,
1402
+ "<|7.98|>": 50764,
1403
+ "<|8.00|>": 50765,
1404
+ "<|8.02|>": 50766,
1405
+ "<|8.04|>": 50767,
1406
+ "<|8.06|>": 50768,
1407
+ "<|8.08|>": 50769,
1408
+ "<|8.10|>": 50770,
1409
+ "<|8.12|>": 50771,
1410
+ "<|8.14|>": 50772,
1411
+ "<|8.16|>": 50773,
1412
+ "<|8.18|>": 50774,
1413
+ "<|8.20|>": 50775,
1414
+ "<|8.22|>": 50776,
1415
+ "<|8.24|>": 50777,
1416
+ "<|8.26|>": 50778,
1417
+ "<|8.28|>": 50779,
1418
+ "<|8.30|>": 50780,
1419
+ "<|8.32|>": 50781,
1420
+ "<|8.34|>": 50782,
1421
+ "<|8.36|>": 50783,
1422
+ "<|8.38|>": 50784,
1423
+ "<|8.40|>": 50785,
1424
+ "<|8.42|>": 50786,
1425
+ "<|8.44|>": 50787,
1426
+ "<|8.46|>": 50788,
1427
+ "<|8.48|>": 50789,
1428
+ "<|8.50|>": 50790,
1429
+ "<|8.52|>": 50791,
1430
+ "<|8.54|>": 50792,
1431
+ "<|8.56|>": 50793,
1432
+ "<|8.58|>": 50794,
1433
+ "<|8.60|>": 50795,
1434
+ "<|8.62|>": 50796,
1435
+ "<|8.64|>": 50797,
1436
+ "<|8.66|>": 50798,
1437
+ "<|8.68|>": 50799,
1438
+ "<|8.70|>": 50800,
1439
+ "<|8.72|>": 50801,
1440
+ "<|8.74|>": 50802,
1441
+ "<|8.76|>": 50803,
1442
+ "<|8.78|>": 50804,
1443
+ "<|8.80|>": 50805,
1444
+ "<|8.82|>": 50806,
1445
+ "<|8.84|>": 50807,
1446
+ "<|8.86|>": 50808,
1447
+ "<|8.88|>": 50809,
1448
+ "<|8.90|>": 50810,
1449
+ "<|8.92|>": 50811,
1450
+ "<|8.94|>": 50812,
1451
+ "<|8.96|>": 50813,
1452
+ "<|8.98|>": 50814,
1453
+ "<|9.00|>": 50815,
1454
+ "<|9.02|>": 50816,
1455
+ "<|9.04|>": 50817,
1456
+ "<|9.06|>": 50818,
1457
+ "<|9.08|>": 50819,
1458
+ "<|9.10|>": 50820,
1459
+ "<|9.12|>": 50821,
1460
+ "<|9.14|>": 50822,
1461
+ "<|9.16|>": 50823,
1462
+ "<|9.18|>": 50824,
1463
+ "<|9.20|>": 50825,
1464
+ "<|9.22|>": 50826,
1465
+ "<|9.24|>": 50827,
1466
+ "<|9.26|>": 50828,
1467
+ "<|9.28|>": 50829,
1468
+ "<|9.30|>": 50830,
1469
+ "<|9.32|>": 50831,
1470
+ "<|9.34|>": 50832,
1471
+ "<|9.36|>": 50833,
1472
+ "<|9.38|>": 50834,
1473
+ "<|9.40|>": 50835,
1474
+ "<|9.42|>": 50836,
1475
+ "<|9.44|>": 50837,
1476
+ "<|9.46|>": 50838,
1477
+ "<|9.48|>": 50839,
1478
+ "<|9.50|>": 50840,
1479
+ "<|9.52|>": 50841,
1480
+ "<|9.54|>": 50842,
1481
+ "<|9.56|>": 50843,
1482
+ "<|9.58|>": 50844,
1483
+ "<|9.60|>": 50845,
1484
+ "<|9.62|>": 50846,
1485
+ "<|9.64|>": 50847,
1486
+ "<|9.66|>": 50848,
1487
+ "<|9.68|>": 50849,
1488
+ "<|9.70|>": 50850,
1489
+ "<|9.72|>": 50851,
1490
+ "<|9.74|>": 50852,
1491
+ "<|9.76|>": 50853,
1492
+ "<|9.78|>": 50854,
1493
+ "<|9.80|>": 50855,
1494
+ "<|9.82|>": 50856,
1495
+ "<|9.84|>": 50857,
1496
+ "<|9.86|>": 50858,
1497
+ "<|9.88|>": 50859,
1498
+ "<|9.90|>": 50860,
1499
+ "<|9.92|>": 50861,
1500
+ "<|9.94|>": 50862,
1501
+ "<|9.96|>": 50863,
1502
+ "<|9.98|>": 50864,
1503
+ "<|af|>": 50327,
1504
+ "<|am|>": 50334,
1505
+ "<|ar|>": 50272,
1506
+ "<|as|>": 50350,
1507
+ "<|az|>": 50304,
1508
+ "<|ba|>": 50355,
1509
+ "<|be|>": 50330,
1510
+ "<|bg|>": 50292,
1511
+ "<|bn|>": 50302,
1512
+ "<|bo|>": 50347,
1513
+ "<|br|>": 50309,
1514
+ "<|bs|>": 50315,
1515
+ "<|ca|>": 50270,
1516
+ "<|cs|>": 50283,
1517
+ "<|cy|>": 50297,
1518
+ "<|da|>": 50285,
1519
+ "<|de|>": 50261,
1520
+ "<|el|>": 50281,
1521
+ "<|endoftext|>": 50257,
1522
+ "<|en|>": 50259,
1523
+ "<|es|>": 50262,
1524
+ "<|et|>": 50307,
1525
+ "<|eu|>": 50310,
1526
+ "<|fa|>": 50300,
1527
+ "<|fi|>": 50277,
1528
+ "<|fo|>": 50338,
1529
+ "<|fr|>": 50265,
1530
+ "<|gl|>": 50319,
1531
+ "<|gu|>": 50333,
1532
+ "<|haw|>": 50352,
1533
+ "<|ha|>": 50354,
1534
+ "<|he|>": 50279,
1535
+ "<|hi|>": 50276,
1536
+ "<|hr|>": 50291,
1537
+ "<|ht|>": 50339,
1538
+ "<|hu|>": 50286,
1539
+ "<|hy|>": 50312,
1540
+ "<|id|>": 50275,
1541
+ "<|is|>": 50311,
1542
+ "<|it|>": 50274,
1543
+ "<|ja|>": 50266,
1544
+ "<|jw|>": 50356,
1545
+ "<|ka|>": 50329,
1546
+ "<|kk|>": 50316,
1547
+ "<|km|>": 50323,
1548
+ "<|kn|>": 50306,
1549
+ "<|ko|>": 50264,
1550
+ "<|la|>": 50294,
1551
+ "<|lb|>": 50345,
1552
+ "<|ln|>": 50353,
1553
+ "<|lo|>": 50336,
1554
+ "<|lt|>": 50293,
1555
+ "<|lv|>": 50301,
1556
+ "<|mg|>": 50349,
1557
+ "<|mi|>": 50295,
1558
+ "<|mk|>": 50308,
1559
+ "<|ml|>": 50296,
1560
+ "<|mn|>": 50314,
1561
+ "<|mr|>": 50320,
1562
+ "<|ms|>": 50282,
1563
+ "<|mt|>": 50343,
1564
+ "<|my|>": 50346,
1565
+ "<|ne|>": 50313,
1566
+ "<|nl|>": 50271,
1567
+ "<|nn|>": 50342,
1568
+ "<|nospeech|>": 50363,
1569
+ "<|notimestamps|>": 50364,
1570
+ "<|no|>": 50288,
1571
+ "<|oc|>": 50328,
1572
+ "<|pa|>": 50321,
1573
+ "<|pl|>": 50269,
1574
+ "<|ps|>": 50340,
1575
+ "<|pt|>": 50267,
1576
+ "<|ro|>": 50284,
1577
+ "<|ru|>": 50263,
1578
+ "<|sa|>": 50344,
1579
+ "<|sd|>": 50332,
1580
+ "<|si|>": 50322,
1581
+ "<|sk|>": 50298,
1582
+ "<|sl|>": 50305,
1583
+ "<|sn|>": 50324,
1584
+ "<|so|>": 50326,
1585
+ "<|sq|>": 50317,
1586
+ "<|sr|>": 50303,
1587
+ "<|startoflm|>": 50361,
1588
+ "<|startofprev|>": 50362,
1589
+ "<|startoftranscript|>": 50258,
1590
+ "<|su|>": 50357,
1591
+ "<|sv|>": 50273,
1592
+ "<|sw|>": 50318,
1593
+ "<|ta|>": 50287,
1594
+ "<|te|>": 50299,
1595
+ "<|tg|>": 50331,
1596
+ "<|th|>": 50289,
1597
+ "<|tk|>": 50341,
1598
+ "<|tl|>": 50348,
1599
+ "<|transcribe|>": 50360,
1600
+ "<|translate|>": 50359,
1601
+ "<|tr|>": 50268,
1602
+ "<|tt|>": 50351,
1603
+ "<|uk|>": 50280,
1604
+ "<|ur|>": 50290,
1605
+ "<|uz|>": 50337,
1606
+ "<|vi|>": 50278,
1607
+ "<|yi|>": 50335,
1608
+ "<|yo|>": 50325,
1609
+ "<|yue|>": 50358,
1610
+ "<|zh|>": 50260
1611
+ }
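
The file above is the tokenizer's added-vocabulary map: timestamp tokens from <|0.00|> to <|30.00|> in 0.02-second steps (ids 50365-51865), plus language, task and control tokens such as <|startoftranscript|>, <|transcribe|> and <|notimestamps|>. As a minimal sketch (not part of this commit, and the checkpoint path is a placeholder), the map can be checked through a tokenizer loaded with Hugging Face transformers:

from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("path/to/this/checkpoint")  # placeholder path

# Every entry in added_tokens.json is an ordinary vocabulary item.
assert tokenizer.convert_tokens_to_ids("<|0.00|>") == 50365       # first timestamp token
assert tokenizer.convert_tokens_to_ids("<|30.00|>") == 51865      # last timestamp token
assert tokenizer.convert_tokens_to_ids("<|no|>") == 50288         # Norwegian language token
assert tokenizer.convert_tokens_to_ids("<|transcribe|>") == 50360 # task token
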
config.json ADDED
@@ -0,0 +1,288 @@
+ {
+ "_name_or_path": "./",
+ "activation_dropout": 0.0,
+ "activation_function": "gelu",
+ "alignment_heads": [
+ [
+ 7,
+ 0
+ ],
+ [
+ 10,
+ 17
+ ],
+ [
+ 12,
+ 18
+ ],
+ [
+ 13,
+ 12
+ ],
+ [
+ 16,
+ 1
+ ],
+ [
+ 17,
+ 14
+ ],
+ [
+ 19,
+ 11
+ ],
+ [
+ 21,
+ 4
+ ],
+ [
+ 24,
+ 1
+ ],
+ [
+ 25,
+ 6
+ ]
+ ],
+ "apply_spec_augment": false,
+ "architectures": [
+ "WhisperForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "begin_suppress_tokens": [
+ 220,
+ 50257
+ ],
+ "bos_token_id": 50257,
+ "classifier_proj_size": 256,
+ "d_model": 1280,
+ "decoder_attention_heads": 20,
+ "decoder_ffn_dim": 5120,
+ "decoder_layerdrop": 0,
+ "decoder_layers": 2,
+ "decoder_start_token_id": 50258,
+ "dropout": 0.0,
+ "encoder_attention_heads": 20,
+ "encoder_ffn_dim": 5120,
+ "encoder_layerdrop": 0,
+ "encoder_layers": 32,
+ "eos_token_id": 50257,
+ "init_std": 0.02,
+ "is_encoder_decoder": true,
+ "lang_ids": [
+ 50259,
+ 50260,
+ 50261,
+ 50262,
+ 50263,
+ 50264,
+ 50265,
+ 50266,
+ 50267,
+ 50268,
+ 50269,
+ 50270,
+ 50271,
+ 50272,
+ 50273,
+ 50274,
+ 50275,
+ 50276,
+ 50277,
+ 50278,
+ 50279,
+ 50280,
+ 50281,
+ 50282,
+ 50283,
+ 50284,
+ 50285,
+ 50286,
+ 50287,
+ 50288,
+ 50289,
+ 50290,
+ 50291,
+ 50292,
+ 50293,
+ 50294,
+ 50295,
+ 50296,
+ 50297,
+ 50298,
+ 50299,
+ 50300,
+ 50301,
+ 50302,
+ 50303,
+ 50304,
+ 50305,
+ 50306,
+ 50307,
+ 50308,
+ 50309,
+ 50310,
+ 50311,
+ 50312,
+ 50313,
+ 50314,
+ 50315,
+ 50316,
+ 50317,
+ 50318,
+ 50319,
+ 50320,
+ 50321,
+ 50322,
+ 50323,
+ 50324,
+ 50325,
+ 50326,
+ 50327,
+ 50328,
+ 50329,
+ 50330,
+ 50331,
+ 50332,
+ 50333,
+ 50334,
+ 50335,
+ 50336,
+ 50337,
+ 50338,
+ 50339,
+ 50340,
+ 50341,
+ 50342,
+ 50343,
+ 50344,
+ 50345,
+ 50346,
+ 50347,
+ 50348,
+ 50349,
+ 50350,
+ 50351,
+ 50352,
+ 50353,
+ 50354,
+ 50355,
+ 50356,
+ 50357,
+ 50358
+ ],
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0.05,
+ "max_length": 448,
+ "max_source_positions": 1500,
+ "max_target_positions": 448,
+ "median_filter_width": 7,
+ "model_type": "whisper",
+ "num_hidden_layers": 32,
+ "num_mel_bins": 128,
+ "pad_token_id": 50256,
+ "scale_embedding": false,
+ "suppress_ids": [
+ 1,
+ 2,
+ 7,
+ 8,
+ 9,
+ 10,
+ 14,
+ 25,
+ 26,
+ 27,
+ 28,
+ 29,
+ 31,
+ 58,
+ 59,
+ 60,
+ 61,
+ 62,
+ 63,
+ 90,
+ 91,
+ 92,
+ 93,
+ 359,
+ 503,
+ 522,
+ 542,
+ 873,
+ 893,
+ 902,
+ 918,
+ 922,
+ 931,
+ 1350,
+ 1853,
+ 1982,
+ 2460,
+ 2627,
+ 3246,
+ 3253,
+ 3268,
+ 3536,
+ 3846,
+ 3961,
+ 4183,
+ 4667,
+ 6585,
+ 6647,
+ 7273,
+ 9061,
+ 9383,
+ 10428,
+ 10929,
+ 11938,
+ 12033,
+ 12331,
+ 12562,
+ 13793,
+ 14157,
+ 14635,
+ 15265,
+ 15618,
+ 16553,
+ 16604,
+ 18362,
+ 18956,
+ 20075,
+ 21675,
+ 22520,
+ 26130,
+ 26161,
+ 26435,
+ 28279,
+ 29464,
+ 31650,
+ 32302,
+ 32470,
+ 36865,
+ 42863,
+ 47425,
+ 49870,
+ 50254,
+ 50258,
+ 50359,
+ 50360,
+ 50361,
+ 50362,
+ 50363
+ ],
+ "suppress_ids_begin": [
+ 220,
+ 50257
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.46.1",
+ "use_cache": true,
+ "use_weighted_layer_sum": false,
+ "vocab_size": 51866
+ }
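
The configuration above describes a Whisper model with the large-v3 encoder geometry (32 encoder layers, d_model 1280, 20 attention heads, 128 mel bins) but only 2 decoder layers, i.e. a distilled decoder, and a vocabulary of 51866 entries, which covers the timestamp ids added above (up to <|30.00|> = 51865). The alignment_heads entries are (layer, head) pairs of cross-attention heads used for word-level timestamp alignment. A minimal sketch (assuming the file loads with transformers' WhisperConfig; the path is a placeholder):

from transformers import WhisperConfig

config = WhisperConfig.from_pretrained("path/to/this/checkpoint")  # placeholder path
print(config.encoder_layers, config.decoder_layers)  # 32, 2 -> distilled decoder
print(config.d_model, config.num_mel_bins)           # 1280, 128
print(config.vocab_size)                             # 51866, so ids 0..51865 are valid
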
distil_whisper/__init__.py ADDED
@@ -0,0 +1,21 @@
+ # coding=utf-8
+ # Copyright 2023 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ __version__ = "0.0.1"
+
+ from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
+ from .partitioner import PjitPartitioner
+ from .pipeline import FlaxWhisperPipeline
+ from .train_state import InferenceState
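
The package exposes the Flax Whisper model, a pjit-based partitioner, a pipeline wrapper and an inference train state. A usage sketch (not part of this commit; the constructor and call signature are assumptions modelled on the whisper-jax-style pipeline, and the paths are placeholders):

import jax.numpy as jnp
from distil_whisper import FlaxWhisperPipeline

# Hypothetical arguments: checkpoint path, compute dtype and batch size.
pipeline = FlaxWhisperPipeline("path/to/this/checkpoint", dtype=jnp.bfloat16, batch_size=16)
outputs = pipeline("audio.mp3", task="transcribe", return_timestamps=True)
print(outputs["text"])
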
distil_whisper/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (420 Bytes).
 
distil_whisper/__pycache__/layers.cpython-310.pyc ADDED
Binary file (41.9 kB).
 
distil_whisper/__pycache__/modeling_flax_whisper.cpython-310.pyc ADDED
Binary file (54 kB).
 
distil_whisper/__pycache__/partitioner.cpython-310.pyc ADDED
Binary file (33.3 kB).
 
distil_whisper/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (16.8 kB).
 
distil_whisper/__pycache__/train_state.cpython-310.pyc ADDED
Binary file (4.11 kB).
 
distil_whisper/layers.py ADDED
@@ -0,0 +1,1338 @@
+ # Copyright 2022 The T5X Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Dense attention classes and mask/weighting functions."""
+
+ # pylint: disable=attribute-defined-outside-init,g-bare-generic
+
+ import dataclasses
+ import functools
+ import operator
+ from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Union
+
+ import jax
+ import jax.numpy as jnp
+ import numpy as np
+ from flax import linen as nn
+ from flax.linen import partitioning as nn_partitioning
+ from flax.linen.dtypes import promote_dtype
+ from jax import lax, random
+
+
+ # from flax.linen.partitioning import param_with_axes, with_sharding_constraint
+ param_with_axes = nn_partitioning.param_with_axes
+ with_sharding_constraint = nn_partitioning.with_sharding_constraint
+
+
+ # Type annotations
+ Array = jnp.ndarray
+ DType = jnp.dtype
+ PRNGKey = jnp.ndarray
+ Shape = Iterable[int]
+ Activation = Callable[..., Array]
+ PrecisionLike = Union[None, str, lax.Precision, Tuple[str, str], Tuple[lax.Precision, lax.Precision]]
+ DotGeneralT = Callable[..., Array]
+ ConvGeneralDilatedT = Callable[..., Array]
+ PaddingLike = Union[str, int, Sequence[Union[int, Tuple[int, int]]]]
+ LaxPadding = Union[str, Sequence[Tuple[int, int]]]
+
+ # Parameter initializers.
+ Initializer = Callable[[PRNGKey, Shape, DType], Array]
+ InitializerAxis = Union[int, Tuple[int, ...]]
+ NdInitializer = Callable[[PRNGKey, Shape, DType, InitializerAxis, InitializerAxis], Array]
+
+ default_embed_init = nn.initializers.variance_scaling(1.0, "fan_in", "normal", out_axis=0)
+
+
+ # ------------------------------------------------------------------------------
+ # Temporary inlined JAX N-d initializer code
+ # TODO(levskaya): remove once new JAX release is out.
+ # ------------------------------------------------------------------------------
+ def _compute_fans(shape: jax.core.NamedShape, in_axis=-2, out_axis=-1):
+     """Inlined JAX `nn.initializer._compute_fans`."""
+     if isinstance(in_axis, int):
+         in_size = shape[in_axis]
+     else:
+         in_size = int(np.prod([shape[i] for i in in_axis]))
+     if isinstance(out_axis, int):
+         out_size = shape[out_axis]
+     else:
+         out_size = int(np.prod([shape[i] for i in out_axis]))
+     receptive_field_size = shape.total / in_size / out_size
+     fan_in = in_size * receptive_field_size
+     fan_out = out_size * receptive_field_size
+     return fan_in, fan_out
+
+
+ def variance_scaling(scale, mode, distribution, in_axis=-2, out_axis=-1, dtype=jnp.float_):
+     """Inlined JAX `nn.initializer.variance_scaling`."""
+
+     def init(key, shape, dtype=dtype):
+         return jnp.zeros(shape, dtype=dtype)
+         dtype = jax.dtypes.canonicalize_dtype(dtype)
+         shape = jax.core.as_named_shape(shape)
+         fan_in, fan_out = _compute_fans(shape, in_axis, out_axis)
+         if mode == "fan_in":
+             denominator = fan_in
+         elif mode == "fan_out":
+             denominator = fan_out
+         elif mode == "fan_avg":
+             denominator = (fan_in + fan_out) / 2
+         else:
+             raise ValueError("invalid mode for variance scaling initializer: {}".format(mode))
+         variance = jnp.array(scale / denominator, dtype=dtype)
+
+         if distribution == "truncated_normal":
+             # constant is stddev of standard normal truncated to (-2, 2)
+             stddev = jnp.sqrt(variance) / jnp.array(0.87962566103423978, dtype)
+             return random.truncated_normal(key, -2, 2, shape, dtype) * stddev
+         elif distribution == "normal":
+             return random.normal(key, shape, dtype) * jnp.sqrt(variance)
+         elif distribution == "uniform":
+             return random.uniform(key, shape, dtype, -1) * jnp.sqrt(3 * variance)
+         else:
+             raise ValueError("invalid distribution for variance scaling initializer: {}".format(distribution))
+
+     return init
+
+
+ # ------------------------------------------------------------------------------
+
+
+ def nd_dense_init(scale, mode, distribution):
+     """Initializer with in_axis, out_axis set at call time."""
+
+     def init_fn(key, shape, dtype, in_axis, out_axis):
+         fn = variance_scaling(scale, mode, distribution, in_axis, out_axis)
+         return fn(key, shape, dtype)
+
+     return init_fn
121
+
122
+
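A minimal usage sketch of the call-time-axis initializer above (assumptions: JAX is installed with a version compatible with the inlined initializer code, and this file is importable, e.g. as `distil_whisper.layers`; that module path is an assumption):

import jax
import jax.numpy as jnp
from distil_whisper.layers import nd_dense_init  # assumed module path

init_fn = nd_dense_init(1.0, "fan_in", "normal")
key = jax.random.PRNGKey(0)
# An (embed=8, heads=2, head_dim=4) kernel: fan-in axis 0, fan-out axes (1, 2).
kernel = init_fn(key, (8, 2, 4), jnp.float32, 0, (1, 2))
print(kernel.shape)  # (8, 2, 4)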
123
+ def dot_product_attention(
124
+ query: Array,
125
+ key: Array,
126
+ value: Array,
127
+ bias: Optional[Array] = None,
128
+ dropout_rng: Optional[PRNGKey] = None,
129
+ dropout_rate: float = 0.0,
130
+ deterministic: bool = False,
131
+ dtype: DType = jnp.float32,
132
+ float32_logits: bool = False,
133
+ ):
134
+ """Computes dot-product attention given query, key, and value.
135
+
136
+ This is the core function for applying attention based on
137
+ https://arxiv.org/abs/1706.03762. It calculates the attention weights given
138
+ query and key and combines the values using the attention weights.
139
+
140
+ Args:
141
+ query: queries for calculating attention with shape of `[batch, q_length,
142
+ num_heads, qk_depth_per_head]`.
143
+ key: keys for calculating attention with shape of `[batch, kv_length,
144
+ num_heads, qk_depth_per_head]`.
145
+ value: values to be used in attention with shape of `[batch, kv_length,
146
+ num_heads, v_depth_per_head]`.
147
+ bias: bias for the attention weights. This should be broadcastable to the
148
+ shape `[batch, num_heads, q_length, kv_length]` This can be used for
149
+ incorporating causal masks, padding masks, proximity bias, etc.
150
+ dropout_rng: JAX PRNGKey: to be used for dropout
151
+ dropout_rate: dropout rate
152
+ deterministic: bool, deterministic or not (to apply dropout)
153
+ dtype: the dtype of the computation (default: float32)
154
+ float32_logits: bool, if True then compute logits in float32 to avoid
155
+ numerical issues with bfloat16.
156
+
157
+ Returns:
158
+ Output of shape `[batch, length, num_heads, v_depth_per_head]`.
159
+ """
160
+ assert key.ndim == query.ndim == value.ndim, "q, k, v must have same rank."
161
+ assert query.shape[:-3] == key.shape[:-3] == value.shape[:-3], "q, k, v batch dims must match."
162
+ assert query.shape[-2] == key.shape[-2] == value.shape[-2], "q, k, v num_heads must match."
163
+ assert key.shape[-3] == value.shape[-3], "k, v lengths must match."
164
+ assert query.shape[-1] == key.shape[-1], "q, k depths must match."
165
+
166
+ # Casting logits and softmax computation for float32 for model stability.
167
+ if float32_logits:
168
+ query = query.astype(jnp.float32)
169
+ key = key.astype(jnp.float32)
170
+
171
+ # `attn_weights`: [batch, num_heads, q_length, kv_length]
172
+ attn_weights = jnp.einsum("bqhd,bkhd->bhqk", query, key)
173
+
174
+ # Apply attention bias: masking, dropout, proximity bias, etc.
175
+ if bias is not None:
176
+ attn_weights = attn_weights + bias.astype(attn_weights.dtype)
177
+
178
+ # Normalize the attention weights across `kv_length` dimension.
179
+ attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
180
+
181
+ # Apply attention dropout.
182
+ if not deterministic and dropout_rate > 0.0:
183
+ keep_prob = 1.0 - dropout_rate
184
+ # T5 broadcasts along the "length" dim, but unclear which one that
185
+ # corresponds to in positional dimensions here, assuming query dim.
186
+ dropout_shape = list(attn_weights.shape)
187
+ dropout_shape[-2] = 1
188
+ keep = random.bernoulli(dropout_rng, keep_prob, dropout_shape)
189
+ keep = jnp.broadcast_to(keep, attn_weights.shape)
190
+ multiplier = keep.astype(attn_weights.dtype) / jnp.asarray(keep_prob, dtype=dtype)
191
+ attn_weights = attn_weights * multiplier
192
+
193
+ # Take the linear combination of `value`.
194
+ return jnp.einsum("bhqk,bkhd->bqhd", attn_weights, value)
195
+
196
+
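A shape-level sketch of calling `dot_product_attention` directly (assumes JAX; the function above is assumed importable from this module):

import jax
import jax.numpy as jnp
from distil_whisper.layers import dot_product_attention  # assumed module path

batch, q_len, kv_len, heads, depth = 2, 5, 7, 4, 16
q_rng, k_rng, v_rng = jax.random.split(jax.random.PRNGKey(0), 3)
query = jax.random.normal(q_rng, (batch, q_len, heads, depth))
key = jax.random.normal(k_rng, (batch, kv_len, heads, depth))
value = jax.random.normal(v_rng, (batch, kv_len, heads, depth))
out = dot_product_attention(query, key, value, deterministic=True)
print(out.shape)  # (2, 5, 4, 16) == [batch, q_length, num_heads, v_depth_per_head]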
197
+ dynamic_vector_slice_in_dim = jax.vmap(lax.dynamic_slice_in_dim, in_axes=(None, 0, None, None))
198
+
199
+
200
+ class MultiHeadDotProductAttention(nn.Module):
201
+ """Multi-head dot-product attention.
202
+
203
+ Attributes:
204
+ num_heads: number of attention heads. Features (i.e. inputs_q.shape[-1])
205
+ should be divisible by the number of heads.
206
+ head_dim: dimension of each head.
207
+ dtype: the dtype of the computation.
208
+ dropout_rate: dropout rate
209
+ kernel_init: initializer for the kernel of the Dense layers.
210
+ float32_logits: bool, if True then compute logits in float32 to avoid
211
+ numerical issues with bfloat16.
212
+ """
213
+
214
+ num_heads: int
215
+ head_dim: int
216
+ dtype: DType = jnp.float32
217
+ dropout_rate: float = 0.0
218
+ kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal")
219
+ float32_logits: bool = False # computes logits in float32 for stability.
220
+
221
+ @nn.compact
222
+ def __call__(
223
+ self,
224
+ inputs_q: Array,
225
+ inputs_kv: Array,
226
+ mask: Optional[Array] = None,
227
+ bias: Optional[Array] = None,
228
+ *,
229
+ decode: bool = False,
230
+ deterministic: bool = False,
231
+ ) -> Array:
232
+ """Applies multi-head dot product attention on the input data.
233
+
234
+ Projects the inputs into multi-headed query, key, and value vectors,
235
+ applies dot-product attention and project the results to an output vector.
236
+
237
+ There are two modes: decoding and non-decoding (e.g., training). The mode is
238
+ determined by `decode` argument. For decoding, this method is called twice,
239
+ first to initialize the cache and then for an actual decoding process. The
240
+ two calls are differentiated by the presence of 'cached_key' in the variable
241
+ dict. In the cache initialization stage, the cache variables are initialized
242
+ as zeros and will be filled in the subsequent decoding process.
243
+
244
+ In the cache initialization call, `inputs_q` has a shape [batch, length,
245
+ q_features] and `inputs_kv`: [batch, length, kv_features]. During the
246
+ incremental decoding stage, query, key and value all have the shape [batch,
247
+ 1, qkv_features] corresponding to a single step.
248
+
249
+ Args:
250
+ inputs_q: input queries of shape `[batch, q_length, q_features]`.
251
+ inputs_kv: key/values of shape `[batch, kv_length, kv_features]`.
252
+ mask: attention mask of shape `[batch, num_heads, q_length, kv_length]`.
253
+ bias: attention bias of shape `[batch, num_heads, q_length, kv_length]`.
254
+ decode: Whether to prepare and use an autoregressive cache.
255
+ deterministic: Disables dropout if set to True.
256
+
257
+ Returns:
258
+ output of shape `[batch, length, q_features]`.
259
+ """
260
+ projection = functools.partial(
261
+ DenseGeneral,
262
+ axis=-1,
263
+ features=(self.num_heads, self.head_dim),
264
+ kernel_axes=("embed", "heads", "kv"),
265
+ dtype=self.dtype,
266
+ )
267
+
268
+ # NOTE: T5 does not explicitly rescale the attention logits by
269
+ # 1/sqrt(depth_kq)! This is folded into the initializers of the
270
+ # linear transformations, which is equivalent under Adafactor.
271
+ depth_scaling = jnp.sqrt(self.head_dim).astype(self.dtype)
272
+
273
+ def query_init(*args):
274
+ return self.kernel_init(*args) / depth_scaling
275
+
276
+ # Project inputs_q to multi-headed q/k/v
277
+ # dimensions are then [batch, length, num_heads, head_dim]
278
+ query = projection(kernel_init=query_init, name="query")(inputs_q)
279
+ key = projection(kernel_init=self.kernel_init, name="key")(inputs_kv)
280
+ value = projection(kernel_init=self.kernel_init, name="value")(inputs_kv)
281
+
282
+ query = with_sharding_constraint(query, ("batch", "length", "heads", "kv"))
283
+ key = with_sharding_constraint(key, ("batch", "length", "heads", "kv"))
284
+ value = with_sharding_constraint(value, ("batch", "length", "heads", "kv"))
285
+
286
+ if decode:
287
+ # Detect if we're initializing by absence of existing cache data.
288
+ is_initialized = self.has_variable("cache", "cached_key")
289
+
290
+ # The key and value have dimension [batch, length, num_heads, head_dim],
291
+ # but we cache them as [batch, num_heads, head_dim, length] as a TPU
292
+ # fusion optimization. This also enables the "scatter via one-hot
293
+ # broadcast" trick, which means we do a one-hot broadcast instead of a
294
+ # scatter/gather operations, resulting in a 3-4x speedup in practice.
295
+ def swap_dims(x):
296
+ return x[:-3] + tuple(x[i] for i in [-2, -1, -3])
297
+
298
+ cached_key = self.variable("cache", "cached_key", jnp.zeros, swap_dims(key.shape), key.dtype)
299
+ cached_value = self.variable("cache", "cached_value", jnp.zeros, swap_dims(value.shape), value.dtype)
300
+ cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
301
+ if is_initialized:
302
+ batch, num_heads, head_dim, length = cached_key.value.shape
303
+ # During fast autoregressive decoding, we feed one position at a time,
304
+ # and cache the keys and values step by step.
305
+ # Sanity shape check of cached key against input query.
306
+ expected_shape = (batch, 1, num_heads, head_dim)
307
+ if expected_shape != query.shape:
308
+ raise ValueError(
309
+ "Autoregressive cache shape error, "
310
+ "expected query shape %s instead got %s." % (expected_shape, query.shape)
311
+ )
312
+
313
+ # Create a OHE of the current index. NOTE: the index is increased below.
314
+ cur_index = cache_index.value
315
+ one_hot_indices = jax.nn.one_hot(cur_index, length, dtype=key.dtype)
316
+ # In order to update the key, value caches with the current key and
317
+ # value, we move the length axis to the back, similar to what we did for
318
+ # the cached ones above.
319
+ # Note these are currently the key and value of a single position, since
320
+ # we feed one position at a time.
321
+ one_token_key = jnp.moveaxis(key, -3, -1)
322
+ one_token_value = jnp.moveaxis(value, -3, -1)
323
+ # Update key, value caches with our new 1d spatial slices.
324
+ # We implement an efficient scatter into the cache via one-hot
325
+ # broadcast and addition.
326
+ key = cached_key.value + one_token_key * one_hot_indices
327
+ value = cached_value.value + one_token_value * one_hot_indices
328
+ cached_key.value = key
329
+ cached_value.value = value
330
+ cache_index.value = cache_index.value + 1
331
+ # Move the keys and values back to their original shapes.
332
+ key = jnp.moveaxis(key, -1, -3)
333
+ value = jnp.moveaxis(value, -1, -3)
334
+
335
+ # Causal mask for cached decoder self-attention: our single query
336
+ # position should only attend to those key positions that have already
337
+ # been generated and cached, not the remaining zero elements.
338
+ mask = combine_masks(
339
+ mask,
340
+ jnp.broadcast_to(
341
+ jnp.arange(length) <= cur_index,
342
+ # (1, 1, length) represents (head dim, query length, key length)
343
+ # query length is 1 because during decoding we deal with one
344
+ # index.
345
+ # The same mask is applied to all batch elements and heads.
346
+ (batch, 1, 1, length),
347
+ ),
348
+ )
349
+
350
+ # Grab the correct relative attention bias during decoding. This is
351
+ # only required during single step decoding.
352
+ if bias is not None:
353
+ # The bias is a full attention matrix, but during decoding we only
354
+ # have to take a slice of it.
355
+ # This is equivalent to bias[..., cur_index:cur_index+1, :].
356
+ bias = dynamic_vector_slice_in_dim(jnp.squeeze(bias, axis=0), jnp.reshape(cur_index, (-1)), 1, -2)
357
+
358
+ # Convert the boolean attention mask to an attention bias.
359
+ if mask is not None:
360
+ # attention mask in the form of attention bias
361
+ attention_bias = lax.select(
362
+ mask > 0,
363
+ jnp.full(mask.shape, 0.0).astype(self.dtype),
364
+ jnp.full(mask.shape, -1e10).astype(self.dtype),
365
+ )
366
+ else:
367
+ attention_bias = None
368
+
369
+ # Add provided bias term (e.g. relative position embedding).
370
+ if bias is not None:
371
+ attention_bias = combine_biases(attention_bias, bias)
372
+
373
+ dropout_rng = None
374
+ if not deterministic and self.dropout_rate > 0.0:
375
+ dropout_rng = self.make_rng("dropout")
376
+
377
+ # Apply attention.
378
+ x = dot_product_attention(
379
+ query,
380
+ key,
381
+ value,
382
+ bias=attention_bias,
383
+ dropout_rng=dropout_rng,
384
+ dropout_rate=self.dropout_rate,
385
+ deterministic=deterministic,
386
+ dtype=self.dtype,
387
+ float32_logits=self.float32_logits,
388
+ )
389
+
390
+ # Back to the original inputs dimensions.
391
+ out = DenseGeneral(
392
+ features=inputs_q.shape[-1], # output dim is set to the input dim.
393
+ axis=(-2, -1),
394
+ kernel_init=self.kernel_init,
395
+ kernel_axes=("heads", "kv", "embed"),
396
+ dtype=self.dtype,
397
+ name="out",
398
+ )(x)
399
+ return out
400
+
401
+
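A small sketch of the "scatter via one-hot broadcast" cache update used in the decoding branch above, checked against `lax.dynamic_update_slice` (assumes JAX only):

import jax
import jax.numpy as jnp
from jax import lax

length, head_dim = 6, 4
cache = jnp.zeros((head_dim, length))            # cached entries, length axis last
new = jnp.arange(head_dim, dtype=jnp.float32)    # the single-position slice to write
cur_index = 3
one_hot = jax.nn.one_hot(cur_index, length, dtype=cache.dtype)   # shape (length,)
updated = cache + new[:, None] * one_hot                         # broadcast, then add
reference = lax.dynamic_update_slice(cache, new[:, None], (0, cur_index))
print(jnp.allclose(updated, reference))  # True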
402
+ def _normalize_axes(axes: Iterable[int], ndim: int) -> Tuple[int]:
403
+ # A tuple by convention. len(axes_tuple) then also gives the rank efficiently.
404
+ return tuple([ax if ax >= 0 else ndim + ax for ax in axes])
405
+
406
+
407
+ def _canonicalize_tuple(x):
408
+ if isinstance(x, Iterable):
409
+ return tuple(x)
410
+ else:
411
+ return (x,)
412
+
413
+
414
+ # ------------------------------------------------------------------------------
415
+ # DenseGeneral for attention layers.
416
+ # ------------------------------------------------------------------------------
417
+ class DenseGeneral(nn.Module):
418
+ """A linear transformation (without bias) with flexible axes.
419
+
420
+ Attributes:
421
+ features: tuple with numbers of output features.
422
+ axis: tuple with axes to apply the transformation on.
423
+ dtype: the dtype of the computation (default: float32).
424
+ kernel_init: initializer function for the weight matrix.
425
+ """
426
+
427
+ features: Union[Iterable[int], int]
428
+ axis: Union[Iterable[int], int] = -1
429
+ dtype: DType = jnp.float32
430
+ params_dtype: DType = jnp.float32
431
+ kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "normal")
432
+ kernel_axes: Tuple[str, ...] = ()
433
+ use_bias: bool = True
434
+ bias_init: Any = nn.initializers.zeros
435
+
436
+ @nn.compact
437
+ def __call__(self, inputs: Array) -> Array:
438
+ """Applies a linear transformation to the inputs along multiple dimensions.
439
+
440
+ Args:
441
+ inputs: The nd-array to be transformed.
442
+
443
+ Returns:
444
+ The transformed input.
445
+ """
446
+ features = _canonicalize_tuple(self.features)
447
+ axis = _canonicalize_tuple(self.axis)
448
+
449
+ inputs = jnp.asarray(inputs, self.dtype)
450
+ axis = _normalize_axes(axis, inputs.ndim)
451
+
452
+ kernel_shape = tuple([inputs.shape[ax] for ax in axis]) + features
453
+ kernel_in_axis = np.arange(len(axis))
454
+ kernel_out_axis = np.arange(len(axis), len(axis) + len(features))
455
+ kernel = param_with_axes(
456
+ "kernel",
457
+ self.kernel_init,
458
+ kernel_shape,
459
+ self.params_dtype,
460
+ kernel_in_axis,
461
+ kernel_out_axis,
462
+ axes=self.kernel_axes,
463
+ )
464
+ if self.use_bias:
465
+ bias = param_with_axes(
466
+ "bias",
467
+ self.bias_init,
468
+ features,
469
+ self.params_dtype,
470
+ axes=(self.kernel_axes[-1],),
471
+ )
472
+ kernel = jnp.asarray(kernel, self.dtype)
473
+
474
+ contract_ind = tuple(range(0, len(axis)))
475
+ y = lax.dot_general(inputs, kernel, ((axis, contract_ind), ((), ())))
476
+ if self.use_bias:
477
+ bias = jnp.asarray(bias, self.dtype)
478
+ # y += jnp.reshape(bias, (1,) * (y.ndim - 1) + (-1,))
479
+ y += jnp.reshape(bias, (1,) * (len(features) - y.ndim) + bias.shape[:])
480
+ return y
481
+
482
+
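A sketch of the multi-axis contraction `DenseGeneral` performs, written directly with `lax.dot_general` (assumes JAX): the same pattern as the `out` projection above, which maps `[batch, length, heads, kv]` to `[batch, length, embed]`.

import jax.numpy as jnp
from jax import lax

batch, length, heads, kv, embed = 2, 3, 4, 5, 7
x = jnp.ones((batch, length, heads, kv))
kernel = jnp.ones((heads, kv, embed))
# Contract input axes (-2, -1) against the leading kernel axes.
y = lax.dot_general(x, kernel, (((2, 3), (0, 1)), ((), ())))
print(y.shape)  # (2, 3, 7)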
483
+ def _convert_to_activation_function(fn_or_string: Union[str, Callable]) -> Callable:
484
+ """Convert a string to an activation function."""
485
+ if fn_or_string == "linear":
486
+ return lambda x: x
487
+ elif isinstance(fn_or_string, str):
488
+ return getattr(nn, fn_or_string)
489
+ elif callable(fn_or_string):
490
+ return fn_or_string
491
+ else:
492
+ raise ValueError("don't know how to convert %s to an activation function" % (fn_or_string,))
493
+
494
+
495
+ class MlpBlock(nn.Module):
496
+ """Transformer MLP / feed-forward block.
497
+
498
+ Attributes:
499
+ intermediate_dim: Shared dimension of hidden layers.
500
+ activations: Type of activations for each layer. Each element is either
501
+ 'linear', a string function name in flax.linen, or a function.
502
+ kernel_init: Kernel function, passed to the dense layers.
503
+ deterministic: Whether the dropout layers should be deterministic.
504
+ intermediate_dropout_rate: Dropout rate used after the intermediate layers.
505
+ dtype: Type for the dense layer.
506
+ """
507
+
508
+ intermediate_dim: int = 2048
509
+ activations: Sequence[Union[str, Callable]] = ("relu",)
510
+ kernel_init: NdInitializer = nd_dense_init(1.0, "fan_in", "truncated_normal")
511
+ intermediate_dropout_rate: float = 0.1
512
+ dtype: Any = jnp.float32
513
+
514
+ @nn.compact
515
+ def __call__(self, inputs, decode: bool = False, deterministic: bool = False):
516
+ """Applies Transformer MlpBlock module."""
517
+ # Iterate over specified MLP input activation functions.
518
+ # e.g. ('relu',) or ('gelu', 'linear') for gated-gelu.
519
+ activations = []
520
+ for idx, act_fn in enumerate(self.activations):
521
+ dense_name = "wi" if len(self.activations) == 1 else f"wi_{idx}"
522
+ x = DenseGeneral(
523
+ self.intermediate_dim,
524
+ dtype=self.dtype,
525
+ kernel_init=self.kernel_init,
526
+ kernel_axes=("embed", "mlp"),
527
+ name=dense_name,
528
+ )(inputs)
529
+ x = _convert_to_activation_function(act_fn)(x)
530
+ activations.append(x)
531
+
532
+ # Take elementwise product of above intermediate activations.
533
+ x = functools.reduce(operator.mul, activations)
534
+ # Apply dropout and final dense output projection.
535
+ x = nn.Dropout(rate=self.intermediate_dropout_rate, broadcast_dims=(-2,))(
536
+ x, deterministic=deterministic
537
+ ) # Broadcast along length.
538
+ x = with_sharding_constraint(x, ("batch", "length", "mlp"))
539
+ output = DenseGeneral(
540
+ inputs.shape[-1],
541
+ dtype=self.dtype,
542
+ kernel_init=self.kernel_init,
543
+ kernel_axes=("mlp", "embed"),
544
+ name="wo",
545
+ )(x)
546
+ return output
547
+
548
+
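A sketch of the gated combination the activation loop above computes for `activations=('gelu', 'linear')`. In the real block the two branches come from separate `wi_0`/`wi_1` projections of the same input; here a single array stands in for both (assumes JAX only):

import jax
import jax.numpy as jnp

h = jnp.linspace(-2.0, 2.0, 8).reshape(2, 4)  # stand-in for the wi_0 / wi_1 outputs
gated = jax.nn.gelu(h) * h                    # 'gelu' branch times 'linear' branch
print(gated.shape)  # (2, 4)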
549
+ class Embed(nn.Module):
550
+ """A parameterized function from integers [0, n) to d-dimensional vectors.
551
+
552
+ Attributes:
553
+ num_embeddings: number of embeddings.
554
+ features: number of feature dimensions for each embedding.
555
+ dtype: the dtype of the embedding vectors (default: float32).
556
+ embedding_init: embedding initializer.
557
+ one_hot: performs the gather with a one-hot contraction rather than a true
558
+ gather. This is currently needed for SPMD partitioning.
559
+ """
560
+
561
+ num_embeddings: int
562
+ features: int
563
+ cast_input_dtype: Optional[DType] = None
564
+ dtype: DType = jnp.float32
565
+ params_dtype: DType = jnp.float32
566
+ attend_dtype: Optional[DType] = None
567
+ embedding_init: Initializer = default_embed_init
568
+ one_hot: bool = True
569
+ embedding: Array = dataclasses.field(init=False)
570
+
571
+ def setup(self):
572
+ self.embedding = param_with_axes(
573
+ "embedding",
574
+ self.embedding_init,
575
+ (self.num_embeddings, self.features),
576
+ self.params_dtype,
577
+ axes=("vocab", "embed"),
578
+ )
579
+
580
+ def __call__(self, inputs: Array) -> Array:
581
+ """Embeds the inputs along the last dimension.
582
+
583
+ Args:
584
+ inputs: input data, all dimensions are considered batch dimensions.
585
+
586
+ Returns:
587
+ Output which is embedded input data. The output shape follows the input,
588
+ with an additional `features` dimension appended.
589
+ """
590
+ if self.cast_input_dtype:
591
+ inputs = inputs.astype(self.cast_input_dtype)
592
+ if not jnp.issubdtype(inputs.dtype, jnp.integer):
593
+ raise ValueError("Input type must be an integer or unsigned integer.")
594
+ if self.one_hot:
595
+ iota = lax.iota(jnp.int32, self.num_embeddings)
596
+ one_hot = jnp.array(inputs[..., jnp.newaxis] == iota, dtype=self.dtype)
597
+ output = jnp.dot(one_hot, jnp.asarray(self.embedding, self.dtype))
598
+ else:
599
+ output = jnp.asarray(self.embedding, self.dtype)[inputs]
600
+ output = with_sharding_constraint(output, ("batch", "length", "embed"))
601
+ return output
602
+
603
+ def attend(self, query: Array) -> Array:
604
+ """Attend over the embedding using a query array.
605
+
606
+ Args:
607
+ query: array with last dimension equal the feature depth `features` of the
608
+ embedding.
609
+
610
+ Returns:
611
+ An array with final dim `num_embeddings` corresponding to the batched
612
+ inner-product of the array of query vectors against each embedding.
613
+ Commonly used for weight-sharing between embeddings and logit transform
614
+ in NLP models.
615
+ """
616
+ dtype = self.attend_dtype if self.attend_dtype is not None else self.dtype
617
+ return jnp.dot(query, jnp.asarray(self.embedding, dtype).T)
618
+
619
+
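A sketch showing that the one-hot contraction in `Embed.__call__` above is numerically a plain gather (assumes JAX only):

import jax.numpy as jnp
from jax import lax

num_embeddings, features = 10, 4
embedding = jnp.arange(num_embeddings * features, dtype=jnp.float32)
embedding = embedding.reshape(num_embeddings, features)
inputs = jnp.array([[1, 3], [7, 0]])                       # [batch, length]
iota = lax.iota(jnp.int32, num_embeddings)
one_hot = jnp.array(inputs[..., jnp.newaxis] == iota, dtype=jnp.float32)
print(jnp.allclose(jnp.dot(one_hot, embedding), embedding[inputs]))  # True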
620
+ class RelativePositionBiases(nn.Module):
621
+ """Adds T5-style relative positional embeddings to the attention logits.
622
+
623
+ Attributes:
624
+ num_buckets: Number of buckets to bucket distances between key and query
625
+ positions into.
626
+ max_distance: Maximum distance before everything is lumped into the last
627
+ distance bucket.
628
+ num_heads: Number of heads in the attention layer. Each head will get a
629
+ different relative position weighting.
630
+ dtype: Type of arrays through this module.
631
+ embedding_init: initializer for relative embedding table.
632
+ """
633
+
634
+ num_buckets: int
635
+ max_distance: int
636
+ num_heads: int
637
+ dtype: Any
638
+ embedding_init: Callable[..., Array] = nn.linear.default_embed_init
639
+
640
+ @staticmethod
641
+ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
642
+ """Translate relative position to a bucket number for relative attention.
643
+
644
+ The relative position is defined as memory_position - query_position, i.e.
645
+ the distance in tokens from the attending position to the attended-to
646
+ position. If bidirectional=False, then positive relative positions are
647
+ invalid.
648
+ We use smaller buckets for small absolute relative_position and larger
649
+ buckets for larger absolute relative_positions. All relative
650
+ positions >=max_distance map to the same bucket. All relative
651
+ positions <=-max_distance map to the same bucket. This should allow for
652
+ more graceful generalization to longer sequences than the model has been
653
+ trained on.
654
+
655
+ Args:
656
+ relative_position: an int32 array
657
+ bidirectional: a boolean - whether the attention is bidirectional
658
+ num_buckets: an integer
659
+ max_distance: an integer
660
+
661
+ Returns:
662
+ a Tensor with the same shape as relative_position, containing int32
663
+ values in the range [0, num_buckets)
664
+ """
665
+ ret = 0
666
+ n = -relative_position
667
+ if bidirectional:
668
+ num_buckets //= 2
669
+ ret += (n < 0).astype(np.int32) * num_buckets
670
+ n = np.abs(n)
671
+ else:
672
+ n = np.maximum(n, 0)
673
+ # now n is in the range [0, inf)
674
+ max_exact = num_buckets // 2
675
+ is_small = n < max_exact
676
+ val_if_large = max_exact + (
677
+ np.log(n.astype(np.float32) / max_exact + np.finfo(np.float32).eps)
678
+ / np.log(max_distance / max_exact)
679
+ * (num_buckets - max_exact)
680
+ ).astype(np.int32)
681
+ val_if_large = np.minimum(val_if_large, num_buckets - 1)
682
+ ret += np.where(is_small, n, val_if_large)
683
+ return ret
684
+
685
+ @nn.compact
686
+ def __call__(self, qlen, klen, bidirectional=True):
687
+ """Produce relative position embedding attention biases.
688
+
689
+ Args:
690
+ qlen: attention query length.
691
+ klen: attention key length.
692
+ bidirectional: whether to allow positive memory-query relative position
693
+ embeddings.
694
+
695
+ Returns:
696
+ output: `(1, num_heads, q_len, k_len)` attention bias
697
+ """
698
+ # TODO(levskaya): should we be computing this w. numpy as a program
699
+ # constant?
700
+ context_position = np.arange(qlen, dtype=jnp.int32)[:, None]
701
+ memory_position = np.arange(klen, dtype=jnp.int32)[None, :]
702
+ relative_position = memory_position - context_position # shape (qlen, klen)
703
+ rp_bucket = self._relative_position_bucket(
704
+ relative_position,
705
+ bidirectional=bidirectional,
706
+ num_buckets=self.num_buckets,
707
+ max_distance=self.max_distance,
708
+ )
709
+ relative_attention_bias = param_with_axes(
710
+ "rel_embedding",
711
+ self.embedding_init,
712
+ (self.num_heads, self.num_buckets),
713
+ jnp.float32,
714
+ axes=("heads", "relpos_buckets"),
715
+ )
716
+
717
+ relative_attention_bias = jnp.asarray(relative_attention_bias, self.dtype)
718
+ # Instead of using a slow gather, we create a leading-dimension one-hot
719
+ # array from rp_bucket and use it to perform the gather-equivalent via a
720
+ # contraction, i.e.:
721
+ # (num_head, num_buckets) x (num_buckets one-hot, qlen, klen).
722
+ # This is equivalent to relative_attention_bias[:, rp_bucket]
723
+ bcast_iota = lax.broadcasted_iota(jnp.int32, (self.num_buckets, 1, 1), 0)
724
+ rp_bucket_one_hot = jnp.array(rp_bucket[jnp.newaxis, ...] == bcast_iota, dtype=self.dtype)
725
+ # --> shape (num_heads, qlen, klen)
726
+ values = lax.dot_general(
727
+ relative_attention_bias,
728
+ rp_bucket_one_hot,
729
+ (((1,), (0,)), ((), ())), # rhs, lhs contracting dims
730
+ ) # no batched dims
731
+ # Add a singleton batch dimension.
732
+ # --> shape (1, num_heads, qlen, klen)
733
+ return values[jnp.newaxis, ...]
734
+
735
+
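A quick numeric check of the bucketing scheme described in `_relative_position_bucket` above: small offsets get their own buckets, while larger ones share log-spaced buckets (assumes NumPy and that the class above is importable; the module path is an assumption):

import numpy as np
from distil_whisper.layers import RelativePositionBiases  # assumed module path

rel = np.array([[-3, -1, 0, 1, 3, 50, 200]])  # memory_position - query_position
buckets = RelativePositionBiases._relative_position_bucket(
    rel, bidirectional=True, num_buckets=32, max_distance=128
)
print(buckets)  # integer bucket ids, all in the range [0, 32)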
736
+ # ------------------------------------------------------------------------------
737
+ # T5 Layernorm - no subtraction of mean or bias.
738
+ # ------------------------------------------------------------------------------
739
+ # class LayerNorm(nn.Module):
740
+ # """T5 Layer normalization operating on the last axis of the input data."""
741
+ # epsilon: float = 1e-6
742
+ # dtype: Any = jnp.float32
743
+ # scale_init: Initializer = nn.initializers.ones
744
+
745
+ # @nn.compact
746
+ # def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
747
+ # """Applies layer normalization on the input."""
748
+ # x = jnp.asarray(x, jnp.float32)
749
+ # features = x.shape[-1]
750
+ # mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
751
+ # y = jnp.asarray(x * lax.rsqrt(mean2 + self.epsilon), self.dtype)
752
+ # scale = param_with_axes(
753
+ # 'scale', self.scale_init, (features,), jnp.float32, axes=('embed',))
754
+
755
+ # scale = jnp.asarray(scale, self.dtype)
756
+ # return y * scale
757
+
758
+
759
+ class LayerNorm(nn.Module):
760
+ """Layer normalization (https://arxiv.org/abs/1607.06450).
761
+ Operates on the last axis of the input data.
762
+ It normalizes the activations of the layer for each given example in a
763
+ batch independently, rather than across a batch like Batch Normalization.
764
+ i.e. applies a transformation that maintains the mean activation within
765
+ each example close to 0 and the activation standard deviation close to 1.
766
+ Attributes:
767
+ epsilon: A small float added to variance to avoid dividing by zero.
768
+ dtype: the dtype of the computation (default: float32).
769
+ use_bias: If True, bias (beta) is added.
770
+ use_scale: If True, multiply by scale (gamma). When the next layer is linear
771
+ (also e.g. nn.relu), this can be disabled since the scaling will be done
772
+ by the next layer.
773
+ bias_init: Initializer for bias, by default, zero.
774
+ scale_init: Initializer for scale, by default, one.
775
+ """
776
+
777
+ epsilon: float = 1e-6
778
+ dtype: Any = jnp.float32
779
+ params_dtype: DType = jnp.float32
780
+ use_bias: bool = True
781
+ use_scale: bool = True
782
+ bias_init: Callable[[PRNGKey, Shape, Any], Array] = nn.initializers.zeros
783
+ scale_init: Callable[[PRNGKey, Shape, Any], Array] = nn.initializers.ones
784
+
785
+ @nn.compact
786
+ def __call__(self, x):
787
+ """Applies layer normalization on the input.
788
+ Args:
789
+ x: the inputs
790
+ Returns:
791
+ Normalized inputs (the same shape as inputs).
792
+ """
793
+ x = jnp.asarray(x, jnp.float32)
794
+ features = x.shape[-1]
795
+ mean = jnp.mean(x, axis=-1, keepdims=True)
796
+ mean2 = jnp.mean(lax.square(x), axis=-1, keepdims=True)
797
+ var = mean2 - lax.square(mean)
798
+ mul = lax.rsqrt(var + self.epsilon)
799
+ if self.use_scale:
800
+ scale = param_with_axes(
801
+ "scale",
802
+ self.scale_init,
803
+ (features,),
804
+ self.params_dtype,
805
+ axes=("embed",),
806
+ )
807
+ mul = mul * jnp.asarray(scale, self.dtype)
808
+ y = (x - mean) * mul
809
+ if self.use_bias:
810
+ bias = param_with_axes("bias", self.bias_init, (features,), self.params_dtype, axes=("embed",))
811
+ y = y + jnp.asarray(bias, self.dtype)
812
+ return jnp.asarray(y, self.dtype)
813
+
814
+
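A NumPy sketch of the normalization arithmetic in `LayerNorm.__call__` above, without the learned scale and bias; note the variance is formed as E[x^2] - E[x]^2:

import numpy as np

x = np.random.RandomState(0).randn(2, 5).astype(np.float32)
mean = x.mean(axis=-1, keepdims=True)
var = (x ** 2).mean(axis=-1, keepdims=True) - mean ** 2
y = (x - mean) / np.sqrt(var + 1e-6)
print(np.allclose(y.mean(-1), 0.0, atol=1e-5))  # per-example mean ~ 0
print(np.allclose(y.std(-1), 1.0, atol=1e-2))   # per-example std  ~ 1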
815
+ # ------------------------------------------------------------------------------
816
+ # Mask-making utility functions.
817
+ # ------------------------------------------------------------------------------
818
+ def make_attention_mask(
819
+ query_input: Array,
820
+ key_input: Array,
821
+ pairwise_fn: Callable = jnp.multiply,
822
+ extra_batch_dims: int = 0,
823
+ dtype: DType = jnp.float32,
824
+ ) -> Array:
825
+ """Mask-making helper for attention weights.
826
+
827
+ In case of 1d inputs (i.e., `[batch, len_q]`, `[batch, len_kv]`, the
828
+ attention weights will be `[batch, heads, len_q, len_kv]` and this
829
+ function will produce `[batch, 1, len_q, len_kv]`.
830
+
831
+ Args:
832
+ query_input: a batched, flat input of query_length size
833
+ key_input: a batched, flat input of key_length size
834
+ pairwise_fn: broadcasting elementwise comparison function
835
+ extra_batch_dims: number of extra batch dims to add singleton axes for, none
836
+ by default
837
+ dtype: mask return dtype
838
+
839
+ Returns:
840
+ A `[batch, 1, len_q, len_kv]` shaped mask for 1d attention.
841
+ """
842
+ # [batch, len_q, len_kv]
843
+ mask = pairwise_fn(
844
+ # [batch, len_q] -> [batch, len_q, 1]
845
+ jnp.expand_dims(query_input, axis=-1),
846
+ # [batch, len_q] -> [batch, 1, len_kv]
847
+ jnp.expand_dims(key_input, axis=-2),
848
+ )
849
+
850
+ # [batch, 1, len_q, len_kv]. This creates the head dim.
851
+ mask = jnp.expand_dims(mask, axis=-3)
852
+ mask = jnp.expand_dims(mask, axis=tuple(range(extra_batch_dims)))
853
+ return mask.astype(dtype)
854
+
855
+
856
+ def make_causal_mask(x: Array, extra_batch_dims: int = 0, dtype: DType = jnp.float32) -> Array:
857
+ """Make a causal mask for self-attention.
858
+
859
+ In case of 1d inputs (i.e., `[batch, len]`, the self-attention weights
860
+ will be `[batch, heads, len, len]` and this function will produce a
861
+ causal mask of shape `[batch, 1, len, len]`.
862
+
863
+ Note that a causal mask does not depend on the values of x; it only depends on
864
+ the shape. If x has padding elements, they will not be treated in a special
865
+ manner.
866
+
867
+ Args:
868
+ x: input array of shape `[batch, len]`
869
+ extra_batch_dims: number of batch dims to add singleton axes for, none by
870
+ default
871
+ dtype: mask return dtype
872
+
873
+ Returns:
874
+ A `[batch, 1, len, len]` shaped causal mask for 1d attention.
875
+ """
876
+ idxs = jnp.broadcast_to(jnp.arange(x.shape[-1], dtype=jnp.int32), x.shape)
877
+ return make_attention_mask(idxs, idxs, jnp.greater_equal, extra_batch_dims=extra_batch_dims, dtype=dtype)
878
+
879
+
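A shape sketch for the mask helpers above (assumes JAX; the function is assumed importable from this module): a `[batch, len]` input yields a `[batch, 1, len, len]` lower-triangular causal mask.

import jax.numpy as jnp
from distil_whisper.layers import make_causal_mask  # assumed module path

x = jnp.zeros((2, 4))        # only the shape matters, values are ignored
mask = make_causal_mask(x)
print(mask.shape)            # (2, 1, 4, 4)
print(mask[0, 0])            # lower-triangular ones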
880
+ def combine_masks(*masks: Optional[Array], dtype: DType = jnp.float32):
881
+ """Combine attention masks.
882
+
883
+ Args:
884
+ *masks: set of attention mask arguments to combine, some can be None.
885
+ dtype: final mask dtype
886
+
887
+ Returns:
888
+ Combined mask, reduced by logical and, returns None if no masks given.
889
+ """
890
+ masks = [m for m in masks if m is not None]
891
+ if not masks:
892
+ return None
893
+ assert all(
894
+ (x.ndim == masks[0].ndim for x in masks)
895
+ ), f"masks must have same rank: {tuple((x.ndim for x in masks))}"
896
+ mask, *other_masks = masks
897
+ for other_mask in other_masks:
898
+ mask = jnp.logical_and(mask, other_mask)
899
+ return mask.astype(dtype)
900
+
901
+
902
+ def combine_biases(*masks: Optional[Array]):
903
+ """Combine attention biases.
904
+
905
+ Args:
906
+ *masks: set of attention bias arguments to combine, some can be None.
907
+
908
+ Returns:
909
+ Combined mask, reduced by summation, returns None if no masks given.
910
+ """
911
+ masks = [m for m in masks if m is not None]
912
+ if not masks:
913
+ return None
914
+ assert all(
915
+ (x.ndim == masks[0].ndim for x in masks)
916
+ ), f"masks must have same rank: {tuple((x.ndim for x in masks))}"
917
+ mask, *other_masks = masks
918
+ for other_mask in other_masks:
919
+ mask = mask + other_mask
920
+ return mask
921
+
922
+
923
+ def make_decoder_mask(
924
+ decoder_target_tokens: Array,
925
+ dtype: DType,
926
+ decoder_causal_attention: Optional[Array] = None,
927
+ decoder_segment_ids: Optional[Array] = None,
928
+ ) -> Array:
929
+ """Compute the self-attention mask for a decoder.
930
+
931
+ Decoder mask is formed by combining a causal mask, a padding mask and an
932
+ optional packing mask. If decoder_causal_attention is passed, it makes the
933
+ masking non-causal for positions that have value of 1.
934
+
935
+ A prefix LM is applied to a dataset which has a notion of "inputs" and
936
+ "targets", e.g., a machine translation task. The inputs and targets are
937
+ concatenated to form a new target. `decoder_target_tokens` is the concatenated
938
+ decoder output tokens.
939
+
940
+ The "inputs" portion of the concatenated sequence can attend to other "inputs"
941
+ tokens even for those at a later time steps. In order to control this
942
+ behavior, `decoder_causal_attention` is necessary. This is a binary mask with
943
+ a value of 1 indicating that the position belonged to "inputs" portion of the
944
+ original dataset.
945
+
946
+ Example:
947
+
948
+ Suppose we have a dataset with two examples.
949
+
950
+ ds = [{"inputs": [6, 7], "targets": [8]},
951
+ {"inputs": [3, 4], "targets": [5]}]
952
+
953
+ After the data preprocessing with packing, the two examples are packed into
954
+ one example with the following three fields (some fields are skipped for
955
+ simplicity).
956
+
957
+ decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]]
958
+ decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
959
+ decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]
960
+
961
+ where each array has [batch, length] shape with batch size being 1. Then,
962
+ this function computes the following mask.
963
+
964
+ mask = [[[[1, 1, 0, 0, 0, 0, 0],
965
+ [1, 1, 0, 0, 0, 0, 0],
966
+ [1, 1, 1, 0, 0, 0, 0],
967
+ [0, 0, 0, 1, 1, 0, 0],
968
+ [0, 0, 0, 1, 1, 0, 0],
969
+ [0, 0, 0, 1, 1, 1, 0],
970
+ [0, 0, 0, 0, 0, 0, 0]]]]
971
+
972
+ mask[b, 0, :, :] represents the mask for the example `b` in the batch.
973
+ Because mask is for a self-attention layer, the mask's shape is a square of
974
+ shape [query length, key length].
975
+
976
+ mask[b, 0, i, j] = 1 means that the query token at position i can attend to
977
+ the key token at position j.
978
+
979
+ Args:
980
+ decoder_target_tokens: decoder output tokens. [batch, length]
981
+ dtype: dtype of the output mask.
982
+ decoder_causal_attention: a binary mask indicating which position should
983
+ only attend to earlier positions in the sequence. Others will attend
984
+ bidirectionally. [batch, length]
985
+ decoder_segment_ids: decoder segmentation info for packed examples. [batch,
986
+ length]
987
+
988
+ Returns:
989
+ the combined decoder mask.
990
+ """
991
+ masks = []
992
+ # The same mask is applied to all attention heads. So the head dimension is 1,
993
+ # i.e., the mask will be broadcast along the heads dim.
994
+ # [batch, 1, length, length]
995
+ causal_mask = make_causal_mask(decoder_target_tokens, dtype=dtype)
996
+
997
+ # Positions with value 1 in `decoder_causal_attention` can attend
998
+ # bidirectionally.
999
+ if decoder_causal_attention is not None:
1000
+ # [batch, 1, length, length]
1001
+ inputs_mask = make_attention_mask(
1002
+ decoder_causal_attention,
1003
+ decoder_causal_attention,
1004
+ jnp.logical_and,
1005
+ dtype=dtype,
1006
+ )
1007
+ masks.append(jnp.logical_or(causal_mask, inputs_mask).astype(dtype))
1008
+ else:
1009
+ masks.append(causal_mask)
1010
+
1011
+ # Padding mask.
1012
+ masks.append(make_attention_mask(decoder_target_tokens > 0, decoder_target_tokens > 0, dtype=dtype))
1013
+
1014
+ # Packing mask
1015
+ if decoder_segment_ids is not None:
1016
+ masks.append(make_attention_mask(decoder_segment_ids, decoder_segment_ids, jnp.equal, dtype=dtype))
1017
+
1018
+ return combine_masks(*masks, dtype=dtype)
1019
+
1020
+
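The packed two-example case from the docstring above can be reproduced directly (assumes JAX; `make_decoder_mask` is assumed importable from this module):

import jax.numpy as jnp
from distil_whisper.layers import make_decoder_mask  # assumed module path

decoder_target_tokens = jnp.array([[6, 7, 8, 3, 4, 5, 0]])
decoder_segment_ids = jnp.array([[1, 1, 1, 2, 2, 2, 0]])
decoder_causal_attention = jnp.array([[1, 1, 0, 1, 1, 0, 0]])
mask = make_decoder_mask(
    decoder_target_tokens,
    jnp.float32,
    decoder_causal_attention=decoder_causal_attention,
    decoder_segment_ids=decoder_segment_ids,
)
print(mask.shape)                    # (1, 1, 7, 7)
print(mask[0, 0].astype(jnp.int32))  # matches the matrix shown in the docstring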
1021
+ def canonicalize_padding(padding: PaddingLike, rank: int) -> LaxPadding:
1022
+ """Canonicalizes conv padding to a jax.lax supported format."""
1023
+ if isinstance(padding, str):
1024
+ return padding
1025
+ if isinstance(padding, int):
1026
+ return [(padding, padding)] * rank
1027
+ if isinstance(padding, Sequence) and len(padding) == rank:
1028
+ new_pad = []
1029
+ for p in padding:
1030
+ if isinstance(p, int):
1031
+ new_pad.append((p, p))
1032
+ elif isinstance(p, tuple) and len(p) == 2:
1033
+ new_pad.append(p)
1034
+ else:
1035
+ break
1036
+ if len(new_pad) == rank:
1037
+ return new_pad
1038
+ raise ValueError(
1039
+ f"Invalid padding format: {padding}, should be str, int,"
1040
+ f" or a sequence of len {rank} where each element is an"
1041
+ " int or pair of ints."
1042
+ )
1043
+
1044
+
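A few sample inputs for the padding canonicalization above (the function is assumed importable from this module):

from distil_whisper.layers import canonicalize_padding  # assumed module path

print(canonicalize_padding("SAME", 2))        # 'SAME'
print(canonicalize_padding(1, 2))             # [(1, 1), (1, 1)]
print(canonicalize_padding([2, (0, 1)], 2))   # [(2, 2), (0, 1)]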
1045
+ def _conv_dimension_numbers(input_shape):
1046
+ """Computes the dimension numbers based on the input shape."""
1047
+ ndim = len(input_shape)
1048
+ lhs_spec = (0, ndim - 1) + tuple(range(1, ndim - 1))
1049
+ rhs_spec = (ndim - 1, ndim - 2) + tuple(range(0, ndim - 2))
1050
+ out_spec = lhs_spec
1051
+ return lax.ConvDimensionNumbers(lhs_spec, rhs_spec, out_spec)
1052
+
1053
+
1054
+ class _Conv(nn.Module):
1055
+ """Convolution Module wrapping `lax.conv_general_dilated[_local]`.
1056
+
1057
+ Attributes:
1058
+ features: number of convolution filters.
1059
+ kernel_size: shape of the convolutional kernel. For 1D convolution,
1060
+ the kernel size can be passed as an integer. For all other cases, it must
1061
+ be a sequence of integers.
1062
+ strides: an integer or a sequence of `n` integers, representing the
1063
+ inter-window strides (default: 1).
1064
+ padding: either the string `'SAME'`, the string `'VALID'`, the string
1065
+ `'CIRCULAR'` (periodic boundary conditions), or a sequence of `n` `(low,
1066
+ high)` integer pairs that give the padding to apply before and after each
1067
+ spatial dimension. A single int is interpreted as applying the same padding
1068
+ in all dims and passing a single int in a sequence causes the same padding
1069
+ to be used on both sides. `'CAUSAL'` padding for a 1D convolution will
1070
+ left-pad the convolution axis, resulting in same-sized output.
1071
+ input_dilation: an integer or a sequence of `n` integers, giving the
1072
+ dilation factor to apply in each spatial dimension of `inputs`
1073
+ (default: 1). Convolution with input dilation `d` is equivalent to
1074
+ transposed convolution with stride `d`.
1075
+ kernel_dilation: an integer or a sequence of `n` integers, giving the
1076
+ dilation factor to apply in each spatial dimension of the convolution
1077
+ kernel (default: 1). Convolution with kernel dilation
1078
+ is also known as 'atrous convolution'.
1079
+ feature_group_count: integer, default 1. If specified divides the input
1080
+ features into groups.
1081
+ use_bias: whether to add a bias to the output (default: True).
1082
+ mask: Optional mask for the weights during masked convolution. The mask must
1083
+ be the same shape as the convolution weight matrix.
1084
+ dtype: the dtype of the computation (default: infer from input and params).
1085
+ params_dtype: the dtype passed to parameter initializers (default: float32).
1086
+ precision: numerical precision of the computation see `jax.lax.Precision`
1087
+ for details.
1088
+ kernel_init: initializer for the convolutional kernel.
1089
+ bias_init: initializer for the bias.
1090
+ """
1091
+
1092
+ features: int
1093
+ kernel_size: Sequence[int]
1094
+ strides: Union[None, int, Sequence[int]] = 1
1095
+ padding: PaddingLike = "SAME"
1096
+ input_dilation: Union[None, int, Sequence[int]] = 1
1097
+ kernel_dilation: Union[None, int, Sequence[int]] = 1
1098
+ feature_group_count: int = 1
1099
+ use_bias: bool = True
1100
+ mask: Optional[Array] = None
1101
+ dtype: Optional[DType] = None
1102
+ params_dtype: DType = jnp.float32
1103
+ precision: PrecisionLike = None
1104
+ kernel_init: Callable[[PRNGKey, Shape, DType], Array] = nn.initializers.lecun_normal()
1105
+ bias_init: Callable[[PRNGKey, Shape, DType], Array] = nn.initializers.zeros
1106
+ conv_general_dilated: ConvGeneralDilatedT = lax.conv_general_dilated
1107
+ kernel_axes: Tuple[str, ...] = ()
1108
+
1109
+ @property
1110
+ def shared_weights(self) -> bool: # type: ignore
1111
+ """Defines whether weights are shared or not between different pixels.
1112
+
1113
+ Returns:
1114
+ `True` to use shared weights in convolution (regular convolution).
1115
+ `False` to use different weights at different pixels, a.k.a.
1116
+ "locally connected layer", "unshared convolution", or "local convolution".
1117
+
1118
+ """
1119
+ ...
1120
+
1121
+ @nn.compact
1122
+ def __call__(self, inputs: Array) -> Array:
1123
+ """Applies a (potentially unshared) convolution to the inputs.
1124
+
1125
+ Args:
1126
+ inputs: input data with dimensions (*batch_dims, spatial_dims...,
1127
+ features). This is the channels-last convention, i.e. NHWC for a 2d
1128
+ convolution and NDHWC for a 3D convolution. Note: this is different from
1129
+ the input convention used by `lax.conv_general_dilated`, which puts the
1130
+ spatial dimensions last.
1131
+ Note: If the input has more than 1 batch dimension, all batch dimensions
1132
+ are flattened into a single dimension for the convolution and restored
1133
+ before returning. In some cases directly vmap'ing the layer may yield
1134
+ better performance than this default flattening approach. If the input
1135
+ lacks a batch dimension it will be added for the convolution and removed
1136
+ on return, an allowance made to enable writing single-example code.
1137
+
1138
+ Returns:
1139
+ The convolved data.
1140
+ """
1141
+
1142
+ if isinstance(self.kernel_size, int):
1143
+ raise TypeError(
1144
+ "Expected Conv kernel_size to be a"
1145
+ " tuple/list of integers (eg.: [3, 3]) but got"
1146
+ f" {self.kernel_size}."
1147
+ )
1148
+ else:
1149
+ kernel_size = tuple(self.kernel_size)
1150
+
1151
+ def maybe_broadcast(x: Optional[Union[int, Sequence[int]]]) -> Tuple[int, ...]:
1152
+ if x is None:
1153
+ # backward compatibility with using None as sentinel for
1154
+ # broadcast 1
1155
+ x = 1
1156
+ if isinstance(x, int):
1157
+ return (x,) * len(kernel_size)
1158
+ return tuple(x)
1159
+
1160
+ # Combine all input batch dimensions into a single leading batch axis.
1161
+ num_batch_dimensions = inputs.ndim - (len(kernel_size) + 1)
1162
+ if num_batch_dimensions != 1:
1163
+ input_batch_shape = inputs.shape[:num_batch_dimensions]
1164
+ total_batch_size = int(np.prod(input_batch_shape))
1165
+ flat_input_shape = (total_batch_size,) + inputs.shape[num_batch_dimensions:]
1166
+ inputs = jnp.reshape(inputs, flat_input_shape)
1167
+
1168
+ # self.strides or (1,) * (inputs.ndim - 2)
1169
+ strides = maybe_broadcast(self.strides)
1170
+ input_dilation = maybe_broadcast(self.input_dilation)
1171
+ kernel_dilation = maybe_broadcast(self.kernel_dilation)
1172
+
1173
+ padding_lax = canonicalize_padding(self.padding, len(kernel_size))
1174
+ if padding_lax == "CIRCULAR":
1175
+ kernel_size_dilated = [(k - 1) * d + 1 for k, d in zip(kernel_size, kernel_dilation)]
1176
+ zero_pad: List[Tuple[int, int]] = [(0, 0)]
1177
+ pads = zero_pad + [((k - 1) // 2, k // 2) for k in kernel_size_dilated] + [(0, 0)]
1178
+ inputs = jnp.pad(inputs, pads, mode="wrap")
1179
+ padding_lax = "VALID"
1180
+ elif padding_lax == "CAUSAL":
1181
+ if len(kernel_size) != 1:
1182
+ raise ValueError("Causal padding is only implemented for 1D convolutions.")
1183
+ left_pad = kernel_dilation[0] * (kernel_size[0] - 1)
1184
+ pads = [(0, 0), (left_pad, 0), (0, 0)]
1185
+ inputs = jnp.pad(inputs, pads)
1186
+ padding_lax = "VALID"
1187
+
1188
+ dimension_numbers = _conv_dimension_numbers(inputs.shape)
1189
+ in_features = jnp.shape(inputs)[-1]
1190
+
1191
+ if self.shared_weights:
1192
+ # One shared convolutional kernel for all pixels in the output.
1193
+ assert in_features % self.feature_group_count == 0
1194
+ kernel_shape = kernel_size + (
1195
+ in_features // self.feature_group_count,
1196
+ self.features,
1197
+ )
1198
+
1199
+ else:
1200
+ if self.feature_group_count != 1:
1201
+ raise NotImplementedError(
1202
+ "`lax.conv_general_dilated_local` does not support "
1203
+ f"`feature_group_count != 1`, got `{self.feature_group_count}`."
1204
+ )
1205
+
1206
+ # Need to know the spatial output shape of a standard convolution to
1207
+ # create the unshared convolution kernel.
1208
+ conv_output_shape = jax.eval_shape(
1209
+ lambda lhs, rhs: self.conv_general_dilated( # pylint: disable=g-long-lambda
1210
+ lhs=lhs,
1211
+ rhs=rhs,
1212
+ window_strides=strides,
1213
+ padding=padding_lax,
1214
+ dimension_numbers=dimension_numbers,
1215
+ lhs_dilation=input_dilation,
1216
+ rhs_dilation=kernel_dilation,
1217
+ ),
1218
+ inputs,
1219
+ jax.ShapedArray(kernel_size + (in_features, self.features), inputs.dtype),
1220
+ ).shape
1221
+
1222
+ # One (unshared) convolutional kernel per each pixel in the output.
1223
+ kernel_shape = conv_output_shape[1:-1] + (
1224
+ np.prod(kernel_size) * in_features,
1225
+ self.features,
1226
+ )
1227
+
1228
+ if self.mask is not None and self.mask.shape != kernel_shape:
1229
+ raise ValueError(
1230
+ "Mask needs to have the same shape as weights. " f"Shapes are: {self.mask.shape}, {kernel_shape}"
1231
+ )
1232
+
1233
+ kernel = param_with_axes(
1234
+ "kernel",
1235
+ self.kernel_init,
1236
+ kernel_shape,
1237
+ self.params_dtype,
1238
+ axes=self.kernel_axes,
1239
+ )
1240
+
1241
+ if self.mask is not None:
1242
+ kernel *= self.mask
1243
+
1244
+ if self.use_bias:
1245
+ if self.shared_weights:
1246
+ # One bias weight per output channel, shared between pixels.
1247
+ bias_shape = (self.features,)
1248
+ else:
1249
+ # One bias weight per output entry, unshared between pixels.
1250
+ bias_shape = conv_output_shape[1:]
1251
+
1252
+ bias = param_with_axes(
1253
+ "bias",
1254
+ self.bias_init,
1255
+ bias_shape,
1256
+ self.params_dtype,
1257
+ axes=(self.kernel_axes[-1],),
1258
+ )
1259
+ else:
1260
+ bias = None
1261
+
1262
+ inputs, kernel, bias = promote_dtype(inputs, kernel, bias, dtype=self.dtype)
1263
+ if self.shared_weights:
1264
+ y = self.conv_general_dilated(
1265
+ inputs,
1266
+ kernel,
1267
+ strides,
1268
+ padding_lax,
1269
+ lhs_dilation=input_dilation,
1270
+ rhs_dilation=kernel_dilation,
1271
+ dimension_numbers=dimension_numbers,
1272
+ feature_group_count=self.feature_group_count,
1273
+ precision=self.precision,
1274
+ )
1275
+ else:
1276
+ y = lax.conv_general_dilated_local(
1277
+ lhs=inputs,
1278
+ rhs=kernel,
1279
+ window_strides=strides,
1280
+ padding=padding_lax,
1281
+ filter_shape=kernel_size,
1282
+ lhs_dilation=input_dilation,
1283
+ rhs_dilation=kernel_dilation,
1284
+ dimension_numbers=dimension_numbers,
1285
+ precision=self.precision,
1286
+ )
1287
+
1288
+ if self.use_bias:
1289
+ bias = bias.reshape((1,) * (y.ndim - bias.ndim) + bias.shape)
1290
+ y += bias
1291
+
1292
+ if num_batch_dimensions != 1:
1293
+ output_shape = input_batch_shape + y.shape[1:]
1294
+ y = jnp.reshape(y, output_shape)
1295
+ return y
1296
+
1297
+
1298
+ class Conv(_Conv):
1299
+ """Convolution Module wrapping `lax.conv_general_dilated`.
1300
+
1301
+ Attributes:
1302
+ features: number of convolution filters.
1303
+ kernel_size: shape of the convolutional kernel. For 1D convolution,
1304
+ the kernel size can be passed as an integer. For all other cases, it must
1305
+ be a sequence of integers.
1306
+ strides: an integer or a sequence of `n` integers, representing the
1307
+ inter-window strides (default: 1).
1308
+ padding: either the string `'SAME'`, the string `'VALID'`, the string
1309
+ `'CIRCULAR'` (periodic boundary conditions), or a sequence of `n` `(low,
1310
+ high)` integer pairs that give the padding to apply before and after each
1311
+ spatial dimension. A single int is interpreted as applying the same padding
1312
+ in all dims and passing a single int in a sequence causes the same padding
1313
+ to be used on both sides. `'CAUSAL'` padding for a 1D convolution will
1314
+ left-pad the convolution axis, resulting in same-sized output.
1315
+ input_dilation: an integer or a sequence of `n` integers, giving the
1316
+ dilation factor to apply in each spatial dimension of `inputs`
1317
+ (default: 1). Convolution with input dilation `d` is equivalent to
1318
+ transposed convolution with stride `d`.
1319
+ kernel_dilation: an integer or a sequence of `n` integers, giving the
1320
+ dilation factor to apply in each spatial dimension of the convolution
1321
+ kernel (default: 1). Convolution with kernel dilation
1322
+ is also known as 'atrous convolution'.
1323
+ feature_group_count: integer, default 1. If specified divides the input
1324
+ features into groups.
1325
+ use_bias: whether to add a bias to the output (default: True).
1326
+ mask: Optional mask for the weights during masked convolution. The mask must
1327
+ be the same shape as the convolution weight matrix.
1328
+ dtype: the dtype of the computation (default: infer from input and params).
1329
+ params_dtype: the dtype passed to parameter initializers (default: float32).
1330
+ precision: numerical precision of the computation see `jax.lax.Precision`
1331
+ for details.
1332
+ kernel_init: initializer for the convolutional kernel.
1333
+ bias_init: initializer for the bias.
1334
+ """
1335
+
1336
+ @property
1337
+ def shared_weights(self) -> bool:
1338
+ return True
distil_whisper/modeling_flax_whisper.py ADDED
@@ -0,0 +1,2135 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Flax whisper model."""
16
+
17
+ import random
18
+ from functools import partial
19
+ from typing import Dict, Optional, Tuple, Union
20
+
21
+ import flax.linen as nn
22
+ import jax
23
+ import jax.numpy as jnp
24
+ from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
25
+ from flax.linen import combine_masks, make_causal_mask
26
+ from flax.linen.attention import dot_product_attention_weights
27
+ from flax.linen.partitioning import remat, scan_with_axes
28
+ from flax.traverse_util import flatten_dict, unflatten_dict
29
+ from jax import lax
30
+ from jax.random import PRNGKey
31
+ from transformers import WhisperConfig
32
+ from transformers.generation.flax_logits_process import (
33
+ FlaxLogitsProcessor,
34
+ FlaxLogitsProcessorList,
35
+ FlaxWhisperTimeStampLogitsProcessor,
36
+ )
37
+ from transformers.modeling_flax_outputs import (
38
+ FlaxBaseModelOutput,
39
+ FlaxBaseModelOutputWithPastAndCrossAttentions,
40
+ FlaxCausalLMOutputWithCrossAttentions,
41
+ FlaxSeq2SeqLMOutput,
42
+ FlaxSeq2SeqModelOutput,
43
+ )
44
+ from transformers.modeling_flax_utils import (
45
+ ACT2FN,
46
+ FlaxPreTrainedModel,
47
+ append_call_sample_docstring,
48
+ append_replace_return_docstrings,
49
+ overwrite_call_docstring,
50
+ )
51
+ from transformers.utils import (
52
+ add_start_docstrings,
53
+ add_start_docstrings_to_model_forward,
54
+ logging,
55
+ replace_return_docstrings,
56
+ )
57
+
58
+ from .layers import Conv, DenseGeneral, Embed, LayerNorm, with_sharding_constraint
59
+
60
+
61
+ logger = logging.get_logger(__name__)
62
+
63
+
64
+ _CHECKPOINT_FOR_DOC = "openai/whisper-tiny"
65
+ _CONFIG_FOR_DOC = "WhisperConfig"
66
+
67
+
68
+ WHISPER_START_DOCSTRING = r"""
69
+ This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
70
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
71
+ etc.) This model is also a Flax Linen
72
+ [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
73
+ regular Flax Module and refer to the Flax documentation for all matters related to general usage and behavior.
74
+ Finally, this model supports inherent JAX features such as:
75
+ - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
76
+ - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
77
+ - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
78
+ - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
79
+
80
+ Parameters:
81
+ config ([`WhisperConfig`]): Model configuration class with all the parameters of the model.
82
+ Initializing with a config file does not load the weights associated with the model, only the
83
+ configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
84
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
85
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
86
+ `jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision
87
+ inference on GPUs or TPUs. If specified all the computation will be performed with the given `dtype`.
88
+ **Note that this only specifies the dtype of the computation and does not influence the dtype of model
89
+ parameters.** If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`]
90
+ and [`~FlaxPreTrainedModel.to_bf16`].
91
+ """
92
+
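+ # Illustrative (hypothetical) half-precision instantiation, following the `dtype` notes above:
+ #     import jax.numpy as jnp
+ #     model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny", dtype=jnp.bfloat16)
+ # The parameters themselves stay in float32 unless `to_bf16`/`to_fp16` is called separately.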
93
+ WHISPER_INPUTS_DOCSTRING = r"""
94
+ Args:
95
+ input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
96
+ Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
97
+ loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
98
+ the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
99
+ [`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a
100
+ tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]
101
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
102
+ Whisper does not support masking of the `input_features`; this argument is preserved for compatibility, but
103
+ is not used. By default, silence in the input log mel spectrogram is ignored.
104
+ decoder_input_ids (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
105
+ Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
106
+ [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
107
+ [What are decoder input IDs?](../glossary#decoder-input-ids) Whisper uses the `decoder_start_token_id` as
108
+ the starting token for `decoder_input_ids` generation.
109
+ decoder_attention_mask (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
110
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
111
+ be used by default. If you want to change padding behavior, you should modify to your needs. See diagram 1
112
+ in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
113
+ position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
114
+ Whisper does not use `position_ids` in the encoder as `input_features` is always the same size and doesn't
115
+ use masking, but this argument is preserved for compatibility. By default, silence in the input log mel
116
+ spectrogram is ignored.
117
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
118
+ Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
119
+ range `[0, config.max_position_embeddings - 1]`.
120
+ output_attentions (`bool`, *optional*):
121
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
122
+ tensors for more detail.
123
+ output_hidden_states (`bool`, *optional*):
124
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
125
+ more detail.
126
+ return_dict (`bool`, *optional*):
127
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
128
+ """
129
+
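+ # Sketch (assumed usage) of preparing `input_features` with the feature extractor described above:
+ #     from transformers import WhisperFeatureExtractor
+ #     feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
+ #     input_features = feature_extractor(raw_audio, sampling_rate=16000, return_tensors="np").input_features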
130
+ WHISPER_ENCODE_INPUTS_DOCSTRING = r"""
131
+ Args:
132
+ input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`):
133
+ Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
134
+ loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
135
+ the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
136
+ [`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
137
+ tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`].
138
+ attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
139
+ Whisper does not support masking of the `input_features`; this argument is preserved for compatibility, but
140
+ is not used. By default, silence in the input log mel spectrogram is ignored.
141
+ output_attentions (`bool`, *optional*):
142
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
143
+ tensors for more detail.
144
+ output_hidden_states (`bool`, *optional*):
145
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
146
+ more detail.
147
+ return_dict (`bool`, *optional*):
148
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
149
+ """
150
+
151
+ WHISPER_DECODE_INPUTS_DOCSTRING = r"""
152
+ Args:
153
+ decoder_input_ids (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`):
154
+ Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
155
+ [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
156
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
157
+ encoder_outputs (`tuple(tuple(numpy.ndarray)`):
158
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
159
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
160
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
161
+ encoder_attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
162
+ Whisper does not support masking of the `input_features`; this argument is preserved for compatibility,
163
+ but it is not used. By default, silence in the input log mel spectrogram is ignored.
164
+ decoder_attention_mask (`numpy.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
165
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
166
+ be used by default. If you want to change padding behavior, you should modify to your needs. See diagram 1
167
+ in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
168
+ decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
169
+ Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
170
+ range `[0, config.max_position_embeddings - 1]`.
171
+ past_key_values (`Dict[str, numpy.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
172
+ Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
173
+ auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
174
+ output_attentions (`bool`, *optional*):
175
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
176
+ tensors for more detail.
177
+ output_hidden_states (`bool`, *optional*):
178
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
179
+ more detail.
180
+ return_dict (`bool`, *optional*):
181
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
182
+ """
183
+
184
+
185
+ class FlaxStaticForceTokensLogitsProcessor(FlaxLogitsProcessor):
186
+ r"""
187
+ [`FlaxLogitsProcessor`] that takes a list of pairs of integers that indicate a mapping from generation indices to
188
+ token indices that will be forced before sampling. The processor will set their log probs to 0 and all other tokens
189
+ to `-inf` so that they are sampled at their corresponding index. This is a static version of the `transformers` logit
190
+ processor [`FlaxForceTokensLogitsProcessor`] that is compatible with sharded forced tokens.
191
+
192
+ Args:
193
+ force_token_map (`list`):
194
+ Map giving token ids and indices where they will be forced to be sampled.
195
+ """
196
+
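+ # Illustrative (hypothetical) force_token_map in the expected [[index, token]] format, e.g. forcing
+ # language and task tokens at generation indices 1 and 2:
+ #     force_token_map = [[1, 50259], [2, 50359]]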
197
+ def __init__(self, force_token_map):
198
+ # The generic `transformers` logit processor builds `force_token_array` as a dictionary - this is not a valid
199
+ # JAX type, and so we switch to using a JAX array instead
200
+ force_token_map = jnp.array(force_token_map)
201
+ # Converts the array of format [[index, token]] containing the tokens to be forced to an array, where the
202
+ # index of the array corresponds to the index of the token to be forced. For XLA compatibility,
203
+ # indexes without forced tokens will have a negative value. Note that the last token we ever need to force in
204
+ # Whisper is at position 3, so we only construct an array up to this index. The native version constructs a tensor
205
+ # dynamically according to the length of the `force_token_map`. Array shapes need to be concrete for XLA compatibility,
206
+ # so this is not permitted here.
207
+ force_token_array = jnp.ones(3, dtype=jnp.int32) * -1
208
+ for index, token in force_token_map:
209
+ force_token_array = force_token_array.at[index].set(token)
210
+ self.force_token_array = jnp.int32(force_token_array)
211
+
212
+ def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
213
+ def _force_token(generation_idx):
214
+ batch_size = scores.shape[0]
215
+ current_token = self.force_token_array[generation_idx]
216
+
217
+ new_scores = jnp.ones_like(scores, dtype=scores.dtype) * -float("inf")
218
+ updates = jnp.zeros((batch_size, 1), dtype=scores.dtype)
219
+ new_scores = lax.dynamic_update_slice(new_scores, updates, (0, current_token))
220
+ return new_scores
221
+
222
+ scores = lax.cond(
223
+ cur_len >= self.force_token_array.shape[0],
224
+ # If the current length is greater than or equal to the length of force_token_array, the processor does nothing.
225
+ lambda: scores,
226
+ # Otherwise, it may force a certain token.
227
+ lambda: lax.cond(
228
+ self.force_token_array[cur_len] >= 0,
229
+ # Only valid (positive) tokens are forced
230
+ lambda: _force_token(cur_len),
231
+ # Otherwise, the processor does nothing.
232
+ lambda: scores,
233
+ ),
234
+ )
235
+ return scores
236
+
237
+
238
+ class FlaxWhisperAttention(nn.Module):
239
+ config: WhisperConfig
240
+ embed_dim: int
241
+ num_heads: int
242
+ dropout: float = 0.0
243
+ causal: bool = False
244
+ bias: bool = True
245
+ dtype: jnp.dtype = jnp.float32
246
+ params_dtype: jnp.dtype = jnp.float32
247
+
248
+ def setup(self) -> None:
249
+ self.head_dim = self.embed_dim // self.num_heads
250
+ if self.head_dim * self.num_heads != self.embed_dim:
251
+ raise ValueError(
252
+ "embed_dim must be divisible by num_heads (got `embed_dim`:"
253
+ f" {self.embed_dim} and `num_heads`: {self.num_heads})."
254
+ )
255
+
256
+ dense = partial(
257
+ DenseGeneral,
258
+ self.embed_dim,
259
+ axis=-1,
260
+ dtype=self.dtype,
261
+ params_dtype=self.params_dtype,
262
+ kernel_axes=("embed", "joined_kv"),
263
+ )
264
+
265
+ self.q_proj = dense(use_bias=self.bias)
266
+ self.k_proj = dense(use_bias=False)
267
+ self.v_proj = dense(use_bias=self.bias)
268
+
269
+ self.out_proj = DenseGeneral(
270
+ self.embed_dim,
271
+ axis=-1,
272
+ dtype=self.dtype,
273
+ params_dtype=self.params_dtype,
274
+ kernel_axes=("joined_kv", "embed"),
275
+ use_bias=self.bias,
276
+ )
277
+
278
+ if self.causal:
279
+ self.causal_mask = make_causal_mask(
280
+ jnp.ones((1, self.config.max_target_positions), dtype="bool"),
281
+ dtype="bool",
282
+ )
283
+
284
+ def __call__(
285
+ self,
286
+ hidden_states: jnp.ndarray,
287
+ key_value_states: Optional[jnp.ndarray] = None,
288
+ attention_mask: Optional[jnp.ndarray] = None,
289
+ init_cache: bool = False,
290
+ deterministic: bool = True,
291
+ ) -> Tuple[jnp.ndarray]:
292
+ is_cross_attention = key_value_states is not None
293
+ batch_size = hidden_states.shape[0]
294
+
295
+ query_states = self.q_proj(hidden_states)
296
+
297
+ if is_cross_attention:
298
+ key_states = self.k_proj(key_value_states)
299
+ value_states = self.v_proj(key_value_states)
300
+ else:
301
+ key_states = self.k_proj(hidden_states)
302
+ value_states = self.v_proj(hidden_states)
303
+
304
+ query_states = self._split_heads(query_states)
305
+ key_states = self._split_heads(key_states)
306
+ value_states = self._split_heads(value_states)
307
+
308
+ query_states = with_sharding_constraint(query_states, ("batch", "length", "heads", "kv"))
309
+ key_states = with_sharding_constraint(key_states, ("batch", "length", "heads", "kv"))
310
+ value_states = with_sharding_constraint(value_states, ("batch", "length", "heads", "kv"))
311
+
312
+ if self.causal:
313
+ query_length, key_length = query_states.shape[1], key_states.shape[1]
314
+ if self.has_variable("cache", "cached_key"):
315
+ mask_shift = self.variables["cache"]["cache_index"]
316
+ # max_length of cached_key is last dim
317
+ max_decoder_length = self.variables["cache"]["cached_key"].shape[-1]
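+ # slice a (1, 1, query_length, max_decoder_length) window out of the precomputed causal mask,
+ # starting at the current cache index, so the query position being decoded can only attend to
+ # positions that have already been written to the cache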
318
+ causal_mask = lax.dynamic_slice(
319
+ self.causal_mask,
320
+ (0, 0, mask_shift, 0),
321
+ (1, 1, query_length, max_decoder_length),
322
+ )
323
+ else:
324
+ causal_mask = self.causal_mask[:, :, :query_length, :key_length]
325
+ causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
326
+
327
+ # combine masks if needed
328
+ if attention_mask is not None and self.causal:
329
+ attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
330
+ attention_mask = combine_masks(attention_mask, causal_mask)
331
+ elif self.causal:
332
+ attention_mask = causal_mask
333
+ elif attention_mask is not None:
334
+ attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
335
+
336
+ # During fast autoregressive decoding, we feed one position at a time,
337
+ # and cache the keys and values step by step.
338
+
339
+ if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
340
+ key_states, value_states, attention_mask = self._concatenate_to_cache(
341
+ key_states, value_states, query_states, attention_mask
342
+ )
343
+
344
+ # Convert the boolean attention mask to an attention bias.
345
+ if attention_mask is not None:
346
+ # attention mask in the form of attention bias
347
+ attention_bias = lax.select(
348
+ attention_mask > 0,
349
+ jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
350
+ jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
351
+ )
352
+ else:
353
+ attention_bias = None
354
+
355
+ dropout_rng = None
356
+ if not deterministic and self.dropout > 0.0:
357
+ dropout_rng = self.make_rng("dropout")
358
+
359
+ attn_weights = dot_product_attention_weights(
360
+ query_states,
361
+ key_states,
362
+ bias=attention_bias,
363
+ dropout_rng=dropout_rng,
364
+ dropout_rate=self.dropout,
365
+ broadcast_dropout=True,
366
+ deterministic=deterministic,
367
+ dtype=self.dtype,
368
+ precision=None,
369
+ )
370
+
371
+ attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
372
+ attn_output = self._merge_heads(attn_output)
373
+ attn_output = self.out_proj(attn_output)
374
+
375
+ return attn_output, attn_weights
376
+
377
+ def _split_heads(self, hidden_state) -> jnp.ndarray:
378
+ return hidden_state.reshape(hidden_state.shape[:2] + (self.num_heads, self.head_dim))
379
+
380
+ def _merge_heads(self, hidden_state) -> jnp.ndarray:
381
+ return hidden_state.reshape(hidden_state.shape[:2] + (self.embed_dim,))
382
+
383
+ @nn.compact
384
+ def _concatenate_to_cache(self, key, value, query, attention_mask):
385
+ # The following code is largely copied from: https://github.com/google-research/t5x/blob/63d9addf628c6d8c547a407a32095fcb527bb20b/t5x/examples/scalable_t5/layers.py#L280-L284
386
+ is_initialized = self.has_variable("cache", "cached_key")
387
+
388
+ # The key and value have dimension [batch_size, seq_length, num_heads, head_dim],
389
+ # but we cache them as [batch_size, num_heads, head_dim, seq_length] as a TPU
390
+ # fusion optimization. This also enables the "scatter via one-hot
391
+ # broadcast" trick, which means we do a one-hot broadcast instead of a
392
+ # scatter/gather operations, resulting in a 3-4x speedup in practice.
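+ # Illustration (hypothetical shapes): with seq_length = 4 and cur_index = 2,
+ # jax.nn.one_hot(2, 4) == [0., 0., 1., 0.], so `one_token_key * one_hot_indices` below
+ # writes the new key into cache slot 2 via a broadcasted multiply-add rather than a scatter.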
393
+ def swap_dims(x):
394
+ return x[:-3] + tuple(x[i] for i in [-2, -1, -3])
395
+
396
+ cached_key = self.variable("cache", "cached_key", jnp.zeros, swap_dims(key.shape), key.dtype)
397
+ cached_value = self.variable("cache", "cached_value", jnp.zeros, swap_dims(value.shape), value.dtype)
398
+ cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
399
+
400
+ if is_initialized:
401
+ batch_size, num_heads, head_dim, seq_length = cached_key.value.shape
402
+ # During fast autoregressive decoding, we feed one position at a time,
403
+ # and cache the keys and values step by step.
404
+ # Sanity shape check of cached key against input query.
405
+ num_updated_cache_vectors = query.shape[1]
406
+ expected_shape = (batch_size, 1, num_heads, head_dim)
407
+ if num_updated_cache_vectors == 1 and expected_shape != query.shape:
408
+ raise ValueError(
409
+ "Autoregressive cache shape error, expected query shape"
410
+ f" {expected_shape} instead got {query.shape}"
411
+ )
412
+
413
+ # Create a one-hot encoding (OHE) of the current index. NOTE: the index is increased below.
414
+ cur_index = cache_index.value
415
+
416
+ # In order to update the key, value caches with the current key and
417
+ # value, we move the seq_length axis to the back, similar to what we did for
418
+ # the cached ones above.
419
+ # Note these are currently the key and value of a single position, since
420
+ # we feed one position at a time.
421
+ one_token_key = jnp.moveaxis(key, -3, -1)
422
+ one_token_value = jnp.moveaxis(value, -3, -1)
423
+
424
+ # Update key, value caches with our new 1d spatial slices.
425
+ # We implement an efficient scatter into the cache via one-hot
426
+ # broadcast and addition.
427
+ if num_updated_cache_vectors > 1:
428
+ indices = jnp.eye(num_updated_cache_vectors, seq_length)[None, None]
429
+ key = cached_key.value + jnp.matmul(one_token_key, indices)
430
+ value = cached_value.value + jnp.matmul(one_token_value, indices)
431
+ else:
432
+ one_hot_indices = jax.nn.one_hot(cur_index, seq_length, dtype=key.dtype)
433
+ key = cached_key.value + one_token_key * one_hot_indices
434
+ value = cached_value.value + one_token_value * one_hot_indices
435
+
436
+ cached_key.value = key
437
+ cached_value.value = value
438
+ cache_index.value = cache_index.value + num_updated_cache_vectors
439
+
440
+ # Move the keys and values back to their original shapes.
441
+ key = jnp.moveaxis(key, -1, -3)
442
+ value = jnp.moveaxis(value, -1, -3)
443
+
444
+ # causal mask for cached decoder self-attention: our single query position should only
445
+ # attend to those key positions that have already been generated and cached, not the
446
+ # remaining zero elements.
447
+ pad_mask = jnp.broadcast_to(
448
+ jnp.arange(seq_length) < cur_index + num_updated_cache_vectors,
449
+ (batch_size,) + (1, num_updated_cache_vectors, seq_length),
450
+ )
451
+ attention_mask = combine_masks(pad_mask, attention_mask)
452
+
453
+ return key, value, attention_mask
454
+
455
+
456
+ class FlaxWhisperEncoderLayer(nn.Module):
457
+ config: WhisperConfig
458
+ dtype: jnp.dtype = jnp.float32
459
+ params_dtype: jnp.dtype = jnp.float32
460
+ use_scan: bool = False
461
+
462
+ def setup(self) -> None:
463
+ self.embed_dim = self.config.d_model
464
+ self.self_attn = FlaxWhisperAttention(
465
+ config=self.config,
466
+ embed_dim=self.embed_dim,
467
+ num_heads=self.config.encoder_attention_heads,
468
+ dropout=self.config.attention_dropout,
469
+ dtype=self.dtype,
470
+ params_dtype=self.params_dtype,
471
+ )
472
+ self.self_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
473
+ self.dropout_layer = nn.Dropout(rate=self.config.dropout)
474
+ self.activation_fn = ACT2FN[self.config.activation_function]
475
+ self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
476
+ self.fc1 = DenseGeneral(
477
+ self.config.encoder_ffn_dim,
478
+ dtype=self.dtype,
479
+ params_dtype=self.params_dtype,
480
+ kernel_axes=("embed", "mlp"),
481
+ )
482
+ self.fc2 = DenseGeneral(
483
+ self.embed_dim,
484
+ dtype=self.dtype,
485
+ params_dtype=self.params_dtype,
486
+ kernel_axes=("mlp", "embed"),
487
+ )
488
+ self.final_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
489
+
490
+ def __call__(
491
+ self,
492
+ hidden_states: jnp.ndarray,
493
+ attention_mask: jnp.ndarray,
494
+ output_attentions: bool = True,
495
+ deterministic: bool = True,
496
+ all_hidden_states=None, # only used when `use_scan=True` -> we have to fetch the hidden states from within the layer
497
+ ) -> Tuple[jnp.ndarray]:
498
+ if self.use_scan:
499
+ hidden_states = hidden_states[0]
500
+
501
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
502
+
503
+ residual = hidden_states
504
+
505
+ layernorm_output = self.self_attn_layer_norm(hidden_states)
506
+ layernorm_output = with_sharding_constraint(layernorm_output, ("batch", "length", "embed"))
507
+
508
+ attn_output, attn_weights = self.self_attn(hidden_states=layernorm_output, attention_mask=attention_mask)
509
+ attn_output = self.dropout_layer(attn_output, deterministic=deterministic)
510
+ attn_output = residual + attn_output
511
+ attn_output = with_sharding_constraint(attn_output, ("batch", "length", "embed"))
512
+
513
+ residual = attn_output
514
+
515
+ post_layer_norm = self.final_layer_norm(attn_output)
516
+ post_layer_norm = with_sharding_constraint(post_layer_norm, ("batch", "length", "embed"))
517
+
518
+ fc1_output = self.activation_fn(self.fc1(post_layer_norm))
519
+ fc1_output = self.activation_dropout_layer(fc1_output, deterministic=deterministic)
520
+ fc1_output = with_sharding_constraint(fc1_output, ("batch", "length", "mlp"))
521
+
522
+ hidden_states = self.fc2(fc1_output)
523
+ hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
524
+ hidden_states = residual + hidden_states
525
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
526
+
527
+ outputs = (hidden_states,)
528
+
529
+ if output_attentions:
530
+ outputs += (attn_weights,)
531
+
532
+ if self.use_scan:
533
+ if all_hidden_states is not None:
534
+ all_hidden_states = all_hidden_states + (hidden_states,)
535
+ outputs = (
536
+ outputs,
537
+ all_hidden_states,
538
+ )
539
+
540
+ return outputs
541
+
542
+
543
+ class FlaxWhisperEncoderLayerCollection(nn.Module):
544
+ config: WhisperConfig
545
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
546
+ params_dtype: jnp.dtype = jnp.float32
547
+ use_scan: bool = False
548
+ gradient_checkpointing: bool = False
549
+
550
+ @nn.compact
551
+ def __call__(
552
+ self,
553
+ hidden_states,
554
+ attention_mask,
555
+ deterministic: bool = True,
556
+ output_attentions: bool = False,
557
+ output_hidden_states: bool = False,
558
+ return_dict: bool = True,
559
+ ):
560
+ all_attentions = () if output_attentions else None
561
+ all_hidden_states = () if output_hidden_states else None
562
+
563
+ FlaxWhisperEncoderCheckpointLayer = (
564
+ remat(
565
+ FlaxWhisperEncoderLayer,
566
+ static_argnums=(2, 3),
567
+ prevent_cse=not self.use_scan,
568
+ )
569
+ if self.gradient_checkpointing
570
+ else FlaxWhisperEncoderLayer
571
+ )
572
+
573
+ if self.use_scan:
574
+ if output_attentions:
575
+ raise ValueError("Cannot use `scan` with `output_attentions` set to True")
576
+
577
+ # nicest behaviour for scan is to let the compiler figure out the correct shapes for the hidden states
578
+ # so we'll just pass an empty tuple as the carry initializer and hold on to the first hidden states for later
579
+ input_hidden_states = hidden_states
580
+ hidden_states = (hidden_states,)
581
+
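+ # `scan_with_axes` stacks the per-layer parameters along a leading axis and runs the layer as the
+ # body of a `lax.scan`, so the encoder compiles once regardless of depth; `nn.broadcast` in
+ # `in_axes` marks inputs that are shared across all scan iterations rather than sliced per layer.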
582
+ hidden_states, all_hidden_states = scan_with_axes(
583
+ FlaxWhisperEncoderCheckpointLayer,
584
+ variable_axes={"params": 0, "cache": 0},
585
+ split_rngs={"params": True, "dropout": True},
586
+ in_axes=(
587
+ nn.broadcast,
588
+ nn.broadcast,
589
+ nn.broadcast,
590
+ nn.broadcast,
591
+ ),
592
+ variable_carry="all_hidden_states",
593
+ length=self.config.encoder_layers,
594
+ )(
595
+ self.config,
596
+ dtype=self.dtype,
597
+ params_dtype=self.params_dtype,
598
+ use_scan=True,
599
+ name="FlaxEncoderScanLayers",
600
+ )(
601
+ hidden_states,
602
+ attention_mask,
603
+ output_attentions,
604
+ deterministic,
605
+ all_hidden_states, # tuple initializer (or None if not using output_hidden_states)
606
+ )
607
+
608
+ # remove the scan dimension
609
+ hidden_states = hidden_states[0]
610
+
611
+ if output_hidden_states:
612
+ # if we're using scan we'll surely be training -> return hidden states as a tensor rather than tuple
613
+ all_hidden_states = jnp.vstack([input_hidden_states[None, ...], all_hidden_states[0]])
614
+
615
+ else:
616
+ for layer_idx in range(self.config.encoder_layers):
617
+ if output_hidden_states:
618
+ all_hidden_states = all_hidden_states + (hidden_states,)
619
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
620
+ dropout_probability = random.uniform(0, 1)
621
+ if not deterministic and (dropout_probability < self.config.encoder_layerdrop): # skip the layer
622
+ layer_outputs = (None, None)
623
+ else:
624
+ layer_outputs = FlaxWhisperEncoderCheckpointLayer(
625
+ self.config,
626
+ dtype=self.dtype,
627
+ params_dtype=self.params_dtype,
628
+ name=str(layer_idx),
629
+ )(
630
+ hidden_states,
631
+ attention_mask,
632
+ output_attentions,
633
+ deterministic,
634
+ )
635
+ hidden_states = layer_outputs[0]
636
+ if output_attentions:
637
+ all_attentions = all_attentions + (layer_outputs[1],)
638
+
639
+ if output_hidden_states:
640
+ all_hidden_states += (hidden_states,)
641
+
642
+ outputs = (hidden_states, all_hidden_states, all_attentions)
643
+
644
+ if not return_dict:
645
+ return tuple(v for v in outputs if v is not None)
646
+
647
+ return FlaxBaseModelOutput(
648
+ last_hidden_state=hidden_states,
649
+ hidden_states=all_hidden_states,
650
+ attentions=all_attentions,
651
+ )
652
+
653
+
654
+ class FlaxWhisperDecoderLayer(nn.Module):
655
+ config: WhisperConfig
656
+ dtype: jnp.dtype = jnp.float32
657
+ params_dtype: jnp.dtype = jnp.float32
658
+ use_scan: bool = False
659
+
660
+ def setup(self) -> None:
661
+ self.embed_dim = self.config.d_model
662
+ self.self_attn = FlaxWhisperAttention(
663
+ config=self.config,
664
+ embed_dim=self.embed_dim,
665
+ num_heads=self.config.decoder_attention_heads,
666
+ dropout=self.config.attention_dropout,
667
+ causal=True,
668
+ dtype=self.dtype,
669
+ params_dtype=self.params_dtype,
670
+ )
671
+ self.dropout_layer = nn.Dropout(rate=self.config.dropout)
672
+ self.activation_fn = ACT2FN[self.config.activation_function]
673
+ self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
674
+
675
+ self.self_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
676
+ self.encoder_attn = FlaxWhisperAttention(
677
+ config=self.config,
678
+ embed_dim=self.embed_dim,
679
+ num_heads=self.config.decoder_attention_heads,
680
+ dropout=self.config.attention_dropout,
681
+ dtype=self.dtype,
682
+ params_dtype=self.params_dtype,
683
+ )
684
+ self.encoder_attn_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
685
+ self.fc1 = DenseGeneral(
686
+ self.config.decoder_ffn_dim,
687
+ dtype=self.dtype,
688
+ params_dtype=self.params_dtype,
689
+ kernel_axes=("embed", "mlp"),
690
+ )
691
+ self.fc2 = DenseGeneral(
692
+ self.embed_dim,
693
+ dtype=self.dtype,
694
+ params_dtype=self.params_dtype,
695
+ kernel_axes=("mlp", "embed"),
696
+ )
697
+ self.final_layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
698
+
699
+ def __call__(
700
+ self,
701
+ hidden_states: jnp.ndarray,
702
+ attention_mask: jnp.ndarray,
703
+ encoder_hidden_states: Optional[jnp.ndarray] = None,
704
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
705
+ init_cache: bool = False,
706
+ output_attentions: bool = True,
707
+ deterministic: bool = True,
708
+ all_hidden_states=None, # only used when `use_scan=True` -> we have to fetch the hidden states from within the layer
709
+ ) -> Tuple[jnp.ndarray]:
710
+ if self.use_scan:
711
+ hidden_states = hidden_states[0]
712
+
713
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
714
+
715
+ residual = hidden_states
716
+
717
+ layer_norm_output = self.self_attn_layer_norm(hidden_states)
718
+ layer_norm_output = with_sharding_constraint(layer_norm_output, ("batch", "length", "embed"))
719
+
720
+ # Self Attention
721
+ self_attn_output, self_attn_weights = self.self_attn(
722
+ hidden_states=layer_norm_output,
723
+ attention_mask=attention_mask,
724
+ init_cache=init_cache,
725
+ )
726
+ self_attn_output = self.dropout_layer(self_attn_output, deterministic=deterministic)
727
+ self_attn_output = residual + self_attn_output
728
+ self_attn_output = with_sharding_constraint(self_attn_output, ("batch", "length", "embed"))
729
+
730
+ # Cross-Attention Block
731
+ cross_attn_weights = None
732
+ if encoder_hidden_states is not None:
733
+ residual = self_attn_output
734
+
735
+ encoder_layer_norm_output = self.encoder_attn_layer_norm(self_attn_output)
736
+ encoder_layer_norm_output = with_sharding_constraint(
737
+ encoder_layer_norm_output, ("batch", "length", "embed")
738
+ )
739
+
740
+ cross_attn_output, cross_attn_weights = self.encoder_attn(
741
+ hidden_states=encoder_layer_norm_output,
742
+ key_value_states=encoder_hidden_states,
743
+ attention_mask=encoder_attention_mask,
744
+ )
745
+ cross_attn_output = self.dropout_layer(cross_attn_output, deterministic=deterministic)
746
+ cross_attn_output = residual + cross_attn_output
747
+ cross_attn_output = with_sharding_constraint(cross_attn_output, ("batch", "length", "embed"))
748
+
749
+ # Fully Connected
750
+ residual = cross_attn_output
751
+
752
+ post_layer_norm = self.final_layer_norm(cross_attn_output)
753
+ post_layer_norm = with_sharding_constraint(post_layer_norm, ("batch", "length", "embed"))
754
+
755
+ fc1_output = self.activation_fn(self.fc1(post_layer_norm))
756
+ fc1_output = self.activation_dropout_layer(fc1_output, deterministic=deterministic)
757
+ fc1_output = with_sharding_constraint(fc1_output, ("batch", "length", "mlp"))
758
+
759
+ hidden_states = self.fc2(fc1_output)
760
+ hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
761
+ hidden_states = residual + hidden_states
762
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
763
+
764
+ outputs = (hidden_states,)
765
+
766
+ if output_attentions:
767
+ outputs += (self_attn_weights, cross_attn_weights)
768
+
769
+ if self.use_scan:
770
+ if all_hidden_states is not None:
771
+ all_hidden_states = all_hidden_states + (hidden_states,)
772
+ outputs = (
773
+ outputs,
774
+ all_hidden_states,
775
+ )
776
+
777
+ return outputs
778
+
779
+
780
+ class FlaxWhisperDecoderLayerCollection(nn.Module):
781
+ config: WhisperConfig
782
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
783
+ params_dtype: jnp.dtype = jnp.float32
784
+ use_scan: bool = False
785
+ gradient_checkpointing: bool = False
786
+
787
+ @nn.compact
788
+ def __call__(
789
+ self,
790
+ hidden_states,
791
+ attention_mask,
792
+ encoder_hidden_states: Optional[jnp.ndarray] = None,
793
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
794
+ deterministic: bool = True,
795
+ init_cache: bool = False,
796
+ output_attentions: bool = False,
797
+ output_hidden_states: bool = False,
798
+ return_dict: bool = True,
799
+ ):
800
+ # decoder layers
801
+ all_hidden_states = () if output_hidden_states else None
802
+ all_self_attns = () if output_attentions else None
803
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
804
+
805
+ FlaxWhisperDecoderCheckpointLayer = (
806
+ remat(
807
+ FlaxWhisperDecoderLayer,
808
+ static_argnums=(4, 5, 6),
809
+ prevent_cse=not self.use_scan,
810
+ )
811
+ if self.gradient_checkpointing
812
+ else FlaxWhisperDecoderLayer
813
+ )
814
+
815
+ if self.use_scan:
816
+ if output_attentions:
817
+ raise ValueError("Cannot use `scan` with `output_attentions` set to True")
818
+
819
+ input_hidden_states = hidden_states
820
+ hidden_states = (hidden_states,)
821
+
822
+ hidden_states, all_hidden_states = scan_with_axes(
823
+ FlaxWhisperDecoderCheckpointLayer,
824
+ variable_axes={"params": 0, "cache": 0},
825
+ split_rngs={"params": True, "dropout": True},
826
+ in_axes=(
827
+ nn.broadcast,
828
+ nn.broadcast,
829
+ nn.broadcast,
830
+ nn.broadcast,
831
+ nn.broadcast,
832
+ nn.broadcast,
833
+ nn.broadcast,
834
+ ),
835
+ variable_carry="all_hidden_states",
836
+ length=self.config.decoder_layers,
837
+ )(
838
+ self.config,
839
+ dtype=self.dtype,
840
+ params_dtype=self.params_dtype,
841
+ use_scan=True,
842
+ name="FlaxDecoderScanLayers",
843
+ )(
844
+ hidden_states,
845
+ attention_mask,
846
+ encoder_hidden_states,
847
+ encoder_attention_mask,
848
+ init_cache,
849
+ output_attentions,
850
+ deterministic,
851
+ all_hidden_states,
852
+ )
853
+ hidden_states = hidden_states[0]
854
+
855
+ if output_hidden_states:
856
+ # if we're using scan we'll surely be training -> return hidden states as a tensor rather than tuple
857
+ all_hidden_states = jnp.vstack([input_hidden_states[None, ...], all_hidden_states[0]])
858
+
859
+ else:
860
+ for layer_idx in range(self.config.decoder_layers):
861
+ if output_hidden_states:
862
+ all_hidden_states += (hidden_states,)
863
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
864
+ dropout_probability = random.uniform(0, 1)
865
+ if not deterministic and (dropout_probability < self.config.decoder_layerdrop):
866
+ layer_outputs = (None, None, None)
867
+ else:
868
+ layer_outputs = FlaxWhisperDecoderCheckpointLayer(
869
+ self.config,
870
+ dtype=self.dtype,
871
+ params_dtype=self.params_dtype,
872
+ name=str(layer_idx),
873
+ )(
874
+ hidden_states,
875
+ attention_mask,
876
+ encoder_hidden_states,
877
+ encoder_attention_mask,
878
+ init_cache,
879
+ output_attentions,
880
+ deterministic,
881
+ )
882
+
883
+ hidden_states = layer_outputs[0]
884
+ if output_attentions:
885
+ all_self_attns += (layer_outputs[1],)
886
+
887
+ if encoder_hidden_states is not None:
888
+ all_cross_attentions += (layer_outputs[2],)
889
+
890
+ # add hidden states from the last decoder layer
891
+ if output_hidden_states:
892
+ all_hidden_states += (hidden_states,)
893
+
894
+ outputs = [
895
+ hidden_states,
896
+ all_hidden_states,
897
+ all_self_attns,
898
+ all_cross_attentions,
899
+ ]
900
+
901
+ if not return_dict:
902
+ return tuple(v for v in outputs if v is not None)
903
+
904
+ return FlaxBaseModelOutputWithPastAndCrossAttentions(
905
+ last_hidden_state=hidden_states,
906
+ hidden_states=all_hidden_states,
907
+ attentions=all_self_attns,
908
+ cross_attentions=all_cross_attentions,
909
+ )
910
+
911
+
912
+ class FlaxWhisperEncoder(nn.Module):
913
+ config: WhisperConfig
914
+ dtype: jnp.dtype = jnp.float32
915
+ params_dtype: jnp.dtype = jnp.float32
916
+ use_scan: bool = False
917
+ gradient_checkpointing: bool = False
918
+
919
+ def setup(self) -> None:
920
+ self.conv1 = Conv(
921
+ self.config.d_model,
922
+ kernel_size=(3,),
923
+ padding=1,
924
+ dtype=self.dtype,
925
+ params_dtype=self.params_dtype,
926
+ kernel_axes=("channels", "num_mel", "embed"),
927
+ )
928
+ self.conv2 = Conv(
929
+ self.config.d_model,
930
+ kernel_size=(3,),
931
+ strides=2,
932
+ padding=1,
933
+ dtype=self.dtype,
934
+ params_dtype=self.params_dtype,
935
+ kernel_axes=("channels", "embed", "num_mel"),
936
+ )
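+ # conv2 uses stride 2, so the time dimension is halved: 2 * max_source_positions mel frames are
+ # downsampled to max_source_positions hidden states, matching the positional embedding table below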
937
+
938
+ self.dropout_layer = nn.Dropout(rate=self.config.dropout)
939
+
940
+ self.layers = FlaxWhisperEncoderLayerCollection(
941
+ self.config,
942
+ dtype=self.dtype,
943
+ params_dtype=self.params_dtype,
944
+ use_scan=self.use_scan,
945
+ gradient_checkpointing=self.gradient_checkpointing,
946
+ )
947
+ self.embed_positions = Embed(
948
+ self.config.max_source_positions,
949
+ self.config.d_model,
950
+ dtype=self.dtype,
951
+ params_dtype=self.params_dtype,
952
+ )
953
+
954
+ self.layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-05, params_dtype=self.params_dtype)
955
+
956
+ def __call__(
957
+ self,
958
+ input_features: jnp.ndarray,
959
+ output_attentions: bool = False,
960
+ output_hidden_states: bool = False,
961
+ return_dict: bool = True,
962
+ deterministic: bool = True,
963
+ ) -> Tuple[jnp.ndarray]:
964
+ if input_features.shape[1:] != (
965
+ self.config.num_mel_bins,
966
+ self.config.max_source_positions * 2,
967
+ ):
968
+ raise ValueError(
969
+ "input_features.shape[1:] must be equal to (self.config.num_mel_bins,"
970
+ " self.config.max_source_positions * 2) (got"
971
+ f" {input_features.shape[1:]}, but should be"
972
+ f" ({self.config.num_mel_bins},"
973
+ f" {self.config.max_source_positions * 2}))"
974
+ )
975
+
976
+ input_features = input_features.transpose(0, 2, 1)
977
+ hidden_states = jax.nn.gelu(self.conv1(input_features), approximate=False)
978
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "embed", "num_mel"))
979
+ hidden_states = jax.nn.gelu(self.conv2(hidden_states), approximate=False)
980
+ hidden_states = with_sharding_constraint(hidden_states, ("batch", "length", "embed"))
981
+
982
+ embed_positions = self.embed_positions(jnp.arange(self.config.max_source_positions))
983
+ # sinusoidal positional embeddings should not be trained
984
+ embed_positions = jax.lax.stop_gradient(embed_positions)
985
+ hidden_states = hidden_states + embed_positions
986
+
987
+ hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
988
+
989
+ outputs = self.layers(
990
+ hidden_states,
991
+ attention_mask=None,
992
+ deterministic=deterministic,
993
+ output_attentions=output_attentions,
994
+ output_hidden_states=output_hidden_states,
995
+ return_dict=return_dict,
996
+ )
997
+
998
+ last_hidden_states = outputs[0]
999
+ last_hidden_states = self.layer_norm(last_hidden_states)
1000
+
1001
+ # update the last element in `hidden_states` after applying `layernorm` above
1002
+ hidden_states = None
1003
+ if output_hidden_states:
1004
+ hidden_states = outputs[1]
1005
+ if self.use_scan:
1006
+ hidden_states = jnp.vstack([hidden_states[:-1], last_hidden_states[None, ...]])
1007
+ else:
1008
+ hidden_states = hidden_states[:-1] + (last_hidden_states,)
1009
+
1010
+ if not return_dict:
1011
+ outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
1012
+ return tuple(v for v in outputs if v is not None)
1013
+
1014
+ return FlaxBaseModelOutput(
1015
+ last_hidden_state=last_hidden_states,
1016
+ hidden_states=hidden_states,
1017
+ attentions=outputs.attentions,
1018
+ )
1019
+
1020
+
1021
+ class FlaxWhisperDecoder(nn.Module):
1022
+ config: WhisperConfig
1023
+ dtype: jnp.dtype = jnp.float32
1024
+ params_dtype: jnp.dtype = jnp.float32
1025
+ use_scan: bool = False
1026
+ gradient_checkpointing: bool = False
1027
+
1028
+ def setup(self) -> None:
1029
+ self.embed_tokens = Embed(
1030
+ self.config.vocab_size,
1031
+ self.config.d_model,
1032
+ dtype=self.dtype,
1033
+ params_dtype=self.params_dtype,
1034
+ )
1035
+ self.embed_positions = Embed(
1036
+ self.config.max_target_positions,
1037
+ self.config.d_model,
1038
+ dtype=self.dtype,
1039
+ params_dtype=self.params_dtype,
1040
+ )
1041
+
1042
+ self.layers = FlaxWhisperDecoderLayerCollection(
1043
+ self.config,
1044
+ dtype=self.dtype,
1045
+ params_dtype=self.params_dtype,
1046
+ use_scan=self.use_scan,
1047
+ gradient_checkpointing=self.gradient_checkpointing,
1048
+ )
1049
+
1050
+ self.dropout_layer = nn.Dropout(rate=self.config.dropout)
1051
+
1052
+ self.layer_norm = LayerNorm(dtype=self.dtype, epsilon=1e-5, params_dtype=self.params_dtype)
1053
+
1054
+ def __call__(
1055
+ self,
1056
+ input_ids: jnp.ndarray,
1057
+ attention_mask: jnp.ndarray,
1058
+ position_ids: jnp.ndarray,
1059
+ encoder_hidden_states: Optional[jnp.ndarray] = None,
1060
+ init_cache: bool = False,
1061
+ output_attentions: bool = False,
1062
+ output_hidden_states: bool = False,
1063
+ return_dict: bool = True,
1064
+ deterministic: bool = True,
1065
+ ) -> Tuple[jnp.ndarray]:
1066
+ input_embeds = self.embed_tokens(input_ids)
1067
+ position_embeds = self.embed_positions(position_ids)
1068
+
1069
+ hidden_states = input_embeds + position_embeds
1070
+ hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
1071
+
1072
+ outputs = self.layers(
1073
+ hidden_states,
1074
+ attention_mask=attention_mask,
1075
+ encoder_hidden_states=encoder_hidden_states,
1076
+ deterministic=deterministic,
1077
+ init_cache=init_cache,
1078
+ output_attentions=output_attentions,
1079
+ output_hidden_states=output_hidden_states,
1080
+ return_dict=return_dict,
1081
+ )
1082
+
1083
+ last_hidden_states = outputs[0]
1084
+ last_hidden_states = self.layer_norm(last_hidden_states)
1085
+
1086
+ # update the last element in `hidden_states` after applying `layernorm` above
1087
+ hidden_states = None
1088
+ if output_hidden_states:
1089
+ hidden_states = outputs[1]
1090
+ if self.use_scan:
1091
+ hidden_states = jnp.vstack([hidden_states[:-1], last_hidden_states[None, ...]])
1092
+ else:
1093
+ hidden_states = hidden_states[:-1] + (last_hidden_states,)
1094
+
1095
+ if not return_dict:
1096
+ outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
1097
+ return tuple(v for v in outputs if v is not None)
1098
+
1099
+ return FlaxBaseModelOutputWithPastAndCrossAttentions(
1100
+ last_hidden_state=last_hidden_states,
1101
+ hidden_states=hidden_states,
1102
+ attentions=outputs.attentions,
1103
+ cross_attentions=outputs.cross_attentions,
1104
+ )
1105
+
1106
+
1107
+ class FlaxWhisperModule(nn.Module):
1108
+ config: WhisperConfig
1109
+ dtype: jnp.dtype = jnp.float32
1110
+ params_dtype: jnp.dtype = jnp.float32
1111
+ use_scan: bool = False
1112
+ gradient_checkpointing: bool = False
1113
+
1114
+ def setup(self) -> None:
1115
+ self.encoder = FlaxWhisperEncoder(
1116
+ self.config,
1117
+ dtype=self.dtype,
1118
+ params_dtype=self.params_dtype,
1119
+ use_scan=self.use_scan,
1120
+ gradient_checkpointing=self.gradient_checkpointing,
1121
+ )
1122
+ self.decoder = FlaxWhisperDecoder(
1123
+ self.config,
1124
+ dtype=self.dtype,
1125
+ params_dtype=self.params_dtype,
1126
+ use_scan=self.use_scan,
1127
+ gradient_checkpointing=self.gradient_checkpointing,
1128
+ )
1129
+
1130
+ def __call__(
1131
+ self,
1132
+ input_features: jnp.ndarray,
1133
+ decoder_input_ids: jnp.ndarray,
1134
+ decoder_attention_mask: jnp.ndarray,
1135
+ decoder_position_ids: jnp.ndarray,
1136
+ output_attentions: bool = False,
1137
+ output_hidden_states: bool = False,
1138
+ freeze_encoder: bool = False,
1139
+ return_dict: bool = True,
1140
+ deterministic: bool = True,
1141
+ ):
1142
+ encoder_outputs = self.encoder(
1143
+ input_features,
1144
+ output_attentions=output_attentions,
1145
+ output_hidden_states=output_hidden_states,
1146
+ return_dict=return_dict,
1147
+ deterministic=deterministic,
1148
+ )
1149
+
1150
+ encoder_hidden_states = encoder_outputs[0]
1151
+
1152
+ if freeze_encoder:
1153
+ encoder_hidden_states = jax.lax.stop_gradient(encoder_hidden_states)
1154
+
1155
+ decoder_outputs = self.decoder(
1156
+ input_ids=decoder_input_ids,
1157
+ attention_mask=decoder_attention_mask,
1158
+ position_ids=decoder_position_ids,
1159
+ encoder_hidden_states=encoder_hidden_states,
1160
+ output_attentions=output_attentions,
1161
+ output_hidden_states=output_hidden_states,
1162
+ return_dict=return_dict,
1163
+ deterministic=deterministic,
1164
+ )
1165
+
1166
+ if not return_dict:
1167
+ return decoder_outputs + encoder_outputs
1168
+
1169
+ return FlaxSeq2SeqModelOutput(
1170
+ last_hidden_state=decoder_outputs.last_hidden_state,
1171
+ decoder_hidden_states=decoder_outputs.hidden_states,
1172
+ decoder_attentions=decoder_outputs.attentions,
1173
+ cross_attentions=decoder_outputs.cross_attentions,
1174
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1175
+ encoder_hidden_states=encoder_outputs.hidden_states,
1176
+ encoder_attentions=encoder_outputs.attentions,
1177
+ )
1178
+
1179
+ def _get_encoder_module(self):
1180
+ return self.encoder
1181
+
1182
+ def _get_decoder_module(self):
1183
+ return self.decoder
1184
+
1185
+
1186
+ class FlaxWhisperPreTrainedModel(FlaxPreTrainedModel):
1187
+ config_class = WhisperConfig
1188
+ base_model_prefix: str = "model"
1189
+ main_input_name = "input_features"
1190
+ module_class: nn.Module = None
1191
+
1192
+ def __init__(
1193
+ self,
1194
+ config: WhisperConfig,
1195
+ input_shape: Tuple[int, int, int] = None,
1196
+ seed: int = 0,
1197
+ dtype: jnp.dtype = jnp.float32,
1198
+ params_dtype: jnp.dtype = jnp.float32,
1199
+ _do_init: bool = True,
1200
+ # Can only use_scan=True in init if loading scanned weights -> need to handle use_scan=True and unrolled weights
1201
+ use_scan: bool = False,
1202
+ gradient_checkpointing: bool = False,
1203
+ **kwargs,
1204
+ ):
1205
+ self.use_scan = use_scan
1206
+ self.gradient_checkpointing = gradient_checkpointing
1207
+
1208
+ module = self.module_class(
1209
+ config=config,
1210
+ dtype=dtype,
1211
+ params_dtype=params_dtype,
1212
+ use_scan=use_scan,
1213
+ gradient_checkpointing=gradient_checkpointing,
1214
+ **kwargs,
1215
+ )
1216
+
1217
+ if input_shape is None:
1218
+ input_shape = (1, config.num_mel_bins, 2 * config.max_source_positions)
1219
+
1220
+ super().__init__(
1221
+ config,
1222
+ module,
1223
+ input_shape=input_shape,
1224
+ seed=seed,
1225
+ dtype=dtype,
1226
+ _do_init=_do_init,
1227
+ )
1228
+
1229
+ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
1230
+ # init input tensors
1231
+ input_features = jnp.zeros(input_shape, dtype="f4")
1232
+ input_features = input_features.at[(..., -1)].set(self.config.eos_token_id)
1233
+
1234
+ decoder_input_ids = jnp.zeros((input_shape[0], 1), dtype="i4")
1235
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
1236
+
1237
+ batch_size, sequence_length = decoder_input_ids.shape
1238
+ decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
1239
+
1240
+ params_rng, dropout_rng = jax.random.split(rng)
1241
+ rngs = {"params": params_rng, "dropout": dropout_rng}
1242
+
1243
+ random_params = self.module.init(
1244
+ rngs,
1245
+ input_features=input_features,
1246
+ decoder_input_ids=decoder_input_ids,
1247
+ decoder_attention_mask=decoder_attention_mask,
1248
+ decoder_position_ids=decoder_position_ids,
1249
+ )["params"]
1250
+
1251
+ if params is not None:
1252
+ random_params = flatten_dict(unfreeze(random_params))
1253
+ params = flatten_dict(unfreeze(params))
1254
+ for missing_key in self._missing_keys:
1255
+ params[missing_key] = random_params[missing_key]
1256
+ self._missing_keys = set()
1257
+ return freeze(unflatten_dict(params))
1258
+ else:
1259
+ return random_params
1260
+
1261
+ def enable_gradient_checkpointing(self):
1262
+ self.gradient_checkpointing = True
1263
+ self._module = self.module_class(
1264
+ config=self.config,
1265
+ dtype=self.dtype,
1266
+ use_scan=self.use_scan,
1267
+ gradient_checkpointing=self.gradient_checkpointing,
1268
+ )
1269
+
1270
+ def enable_scan(self):
1271
+ self.use_scan = True
1272
+ self._module = self.module_class(
1273
+ config=self.config,
1274
+ dtype=self.dtype,
1275
+ use_scan=self.use_scan,
1276
+ gradient_checkpointing=self.gradient_checkpointing,
1277
+ )
1278
+ init_fn = partial(self.init_weights, input_shape=self.input_shape)
1279
+ params_shape_tree = jax.eval_shape(init_fn, self.key)
1280
+
1281
+ # get the shape of the parameters
1282
+ self._params_shape_tree = params_shape_tree
1283
+
1284
+ # save required_params as set
1285
+ self._required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
1286
+
1287
+ # initialize the parameters
1288
+ if self._is_initialized:
1289
+ self.params = self.convert_unroll_to_scan(self.params)
1290
+
1291
+ def disable_scan(self):
1292
+ self.use_scan = False
1293
+ self._module = self.module_class(
1294
+ config=self.config,
1295
+ dtype=self.dtype,
1296
+ use_scan=self.use_scan,
1297
+ gradient_checkpointing=self.gradient_checkpointing,
1298
+ )
1299
+ init_fn = partial(self.init_weights, input_shape=self.input_shape)
1300
+ params_shape_tree = jax.eval_shape(init_fn, self.key)
1301
+
1302
+ # get the shape of the parameters
1303
+ self._params_shape_tree = params_shape_tree
1304
+
1305
+ # save required_params as set
1306
+ self._required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
1307
+
1308
+ # initialize the parameters
1309
+ if self._is_initialized:
1310
+ self.params = self.convert_scan_to_unroll(self.params)
1311
+
1312
+ def convert_unroll_to_scan(self, params: Union[Dict, FrozenDict]):
1313
+ r"""
1314
+ Convert a `PyTree` of unrolled model parameters to a scanned block of model parameters. This method can be used
1315
+ to explicitly convert the model parameters to scanned format. This returns a new `params` tree and does not
1316
+ convert the `params` in place.
1317
+
1318
+ To illustrate the workings of this method, take the Flax BERT model. The unrolled structure for the query
1319
+ projection params is as follows:
1320
+ ('bert', 'encoder', 'layer', '0', 'self_attn', 'q_proj') ('bert', 'encoder', 'layer', '1', 'self_attn',
1321
+ 'q_proj') ... ('bert', 'encoder', 'layer', '23', 'self_attn', 'q_proj')
1322
+ This method takes each of the `q_proj` matrices for layers (0, ..., 23) and stacks them into a single 'super'
1323
+ matrix, giving a *single* block of weights for all 24 layers compatible with the scanned model:
1324
+ ('bert', 'encoder', 'layer', 'ScanLayers', 'self_attn', 'q_proj')
1325
+
1326
+ When enabling scan with _do_init=True (default), this method will be called automatically under the hood. With
1327
+ _do_init=False, it will have to be called explicitly (see example below).
1328
+
1329
+ Arguments:
1330
+ params (`Union[Dict, FrozenDict]`):
1331
+ A `PyTree` of model parameters.
1332
+
1333
+ Examples:
1334
+
1335
+ ```python
1336
+ >>> from distil_whisper import FlaxWhisperForConditionalGeneration
1337
+
1338
+ >>> # Download model and configuration from huggingface.co
1339
+ >>> model, params = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", _do_init=False)
1340
+ >>> # By default, the model params will be in unrolled format. To illustrate the use of this method,
1341
+ >>> # we'll first convert to scan format and then back to unrolled
1342
+ >>> model.enable_scan()
1343
+ >>> params = model.convert_unroll_to_scan(params)
1344
+ >>> # now convert back to unrolled
1345
+ >>> model.disable_scan()
1346
+ >>> params = model.convert_scan_to_unroll(params)
1347
+ ```"""
1348
+ if isinstance(params, FrozenDict):
1349
+ params = unfreeze(params)
1350
+
1351
+ params = flatten_dict(params, sep="/")
1352
+ keys = list(params.keys())
1353
+
1354
+ for k in keys:
1355
+ # Identify all "unrolled" layers formed as part of the encoder/decoder layer collections
1356
+ # These params contain the identifier `layers/0` in their key
1357
+ if "layers/0" in k:
1358
+ if "decoder" in k:
1359
+ block_prefix = "Decoder"
1360
+ num_hidden_layers = self.config.decoder_layers
1361
+ else:
1362
+ block_prefix = "Encoder"
1363
+ num_hidden_layers = self.config.encoder_layers
1364
+
1365
+ # Squash the keys for the N unrolled layers into one single key:
1366
+ # (layer/0, ..., layer/N) -> layer/FlaxScanLayers
1367
+ scan_key = k.replace("0", f"Flax{block_prefix}ScanLayers")
1368
+ stacked_params = []
1369
+
1370
+ # Iterate over the unrolled layers (1,...,N)
1371
+ for i in range(num_hidden_layers):
1372
+ # Stack the params for the N layers into one super block
1373
+ # and remove the unrolled layer params on the fly
1374
+ # -> no memory overhead for conversion!
1375
+ unrolled_layer = params.pop(k.replace("0", str(i)))
1376
+ stacked_params.append(unrolled_layer)
1377
+
1378
+ params[scan_key] = jnp.stack(stacked_params)
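+ # the stacked entry now has a leading axis of size num_hidden_layers, which is the parameter
+ # layout expected by the scanned (`scan_with_axes`) version of the model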
1379
+
1380
+ # Finally, unflatten the dict to restore the nested pytree structure
1381
+ params = unflatten_dict(params, sep="/")
1382
+ return params
1383
+
1384
+ def convert_scan_to_unroll(self, params: Union[Dict, FrozenDict]):
1385
+ r"""
1386
+ Convert a `PyTree` of scanned model parameters to an unrolled stack of model parameters. This method can be
1387
+ used to explicitly convert the model parameters to unrolled format. This returns a new `params` tree and does
1388
+ not convert the `params` in place.
1389
+
1390
+ To illustrate the workings of this method, take the Flax BERT model. The scanned structure for the query
1391
+ projection (`q_proj`) params is a single, stacked matrix of parameters over all N layers:
1392
+ ('bert', 'encoder', 'layer', 'FlaxScanLayers', 'self_attn', 'q_proj')
1393
+
1394
+ This method slices each layer of the `q_proj` scanned matrix into single, standalone layers, and replaces the
1395
+ scanned matrix of parameters on the fly:
1396
+ ('bert', 'encoder', 'layer', '0', 'self_attn', 'q_proj') ('bert', 'encoder', 'layer', '1', 'self_attn',
1397
+ 'q_proj') ... ('bert', 'encoder', 'layer', 'N', 'self_attn', 'q_proj')
1398
+
1399
+ When enabling scan with _do_init=True (default), this method will be called automatically under the hood. With
1400
+ _do_init=False, it will have to be called explicitly (see example below).
1401
+
1402
+ Arguments:
1403
+ params (`Union[Dict, FrozenDict]`):
1404
+ A `PyTree` of model parameters.
1405
+
1406
+ Examples:
1407
+
1408
+ ```python
1409
+ >>> from distil_whisper import FlaxWhisperForConditionalGeneration
1410
+
1411
+ >>> # Download model and configuration from huggingface.co
1412
+ >>> model, params = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", _do_init=False)
1413
+ >>> # By default, the model params will be in unrolled format. To illustrate the use of this method,
1414
+ >>> # we'll first convert to scan format and then back to unrolled
1415
+ >>> model.enable_scan()
1416
+ >>> params = model.convert_unroll_to_scan(params)
1417
+ >>> # now convert back to unrolled
1418
+ >>> model.disable_scan()
1419
+ >>> params = model.convert_scan_to_unroll(params)
1420
+ ```"""
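+ # Illustrative sketch: this is the inverse of `convert_unroll_to_scan`. A stacked entry such as
+ #   .../layers/FlaxEncoderScanLayers/self_attn/q_proj/kernel with shape (N, d, d)
+ # is sliced along its leading axis back into N standalone entries
+ #   .../layers/0/..., .../layers/1/..., ..., .../layers/N-1/...   each with shape (d, d)
+ # where `d` is a hypothetical hidden size used only for illustration.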
1421
+
1422
+ if isinstance(params, FrozenDict):
1423
+ params = unfreeze(params)
1424
+
1425
+ params = flatten_dict(params, sep="/")
1426
+ keys = list(params.keys())
1427
+
1428
+ for k in keys:
1429
+ # Identify all "scan" layers formed as part of the encoder/decoder layer collections
1430
+ # These params contain the identifier `FlaxEncoderScanLayers` or `FlaxDecoderScanLayers` in their key
1431
+ if "FlaxEncoderScanLayers" in k:
1432
+ # Remove the scan layer from the PyTree of params
1433
+ scan_layer = params.pop(k)
1434
+
1435
+ # Unroll the key for the stacked scan matrix into N separate keys, indexed by layer number
1436
+ # layer/FlaxScanLayers -> (layer/0, ..., layer/N)
1437
+ for i in range(self.config.encoder_layers):
1438
+ # Unstack the params for the i-th scan layer to unrolled
1439
+ # and remove corresponding scan params on the fly
1440
+ # -> no memory overhead for conversion!
1441
+ unrolled_key = k.replace("FlaxEncoderScanLayers", str(i))
1442
+ params[unrolled_key], scan_layer = scan_layer[0], scan_layer[1:]
1443
+
1444
+ elif "FlaxDecoderScanLayers" in k:
1445
+ # Remove the scan layer from the PyTree of params
1446
+ scan_layer = params.pop(k)
1447
+
1448
+ # Unroll the key for the stacked scan matrix into N separate keys, indexed by layer number
1449
+ # layer/FlaxScanLayers -> (layer/0, ..., layer/N)
1450
+ for i in range(self.config.decoder_layers):
1451
+ # Unstack the params for the i-th scan layer to unrolled
1452
+ # and remove corresponding scan params on the fly
1453
+ # -> no memory overhead for conversion!
1454
+ unrolled_key = k.replace("FlaxDecoderScanLayers", str(i))
1455
+ params[unrolled_key], scan_layer = scan_layer[0], scan_layer[1:]
1456
+
1457
+ params = unflatten_dict(params, sep="/")
1458
+ return params
1459
+
1460
+ # Copied from transformers.models.whisper.modeling_flax_whisper.FlaxWhisperPreTrainedModel.init_cache
1461
+ def init_cache(self, batch_size, max_length, encoder_outputs):
1462
+ r"""
1463
+ Args:
1464
+ batch_size (`int`):
1465
+ batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
1466
+ max_length (`int`):
1467
+ maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
1468
+ cache.
1469
+ encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
1470
+ `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
1471
+ `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`
1472
+ is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
1473
+ cross-attention of the decoder.
1474
+ """
1475
+ # init input variables to retrieve cache
1476
+ decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
1477
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
1478
+ decoder_position_ids = jnp.broadcast_to(
1479
+ jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]),
1480
+ decoder_input_ids.shape,
1481
+ )
1482
+
1483
+ def _decoder_forward(
1484
+ module,
1485
+ decoder_input_ids,
1486
+ decoder_attention_mask,
1487
+ decoder_position_ids,
1488
+ **kwargs,
1489
+ ):
1490
+ decoder_module = module._get_decoder_module()
1491
+ return decoder_module(
1492
+ decoder_input_ids,
1493
+ decoder_attention_mask,
1494
+ decoder_position_ids,
1495
+ **kwargs,
1496
+ )
1497
+
1498
+ init_variables = self.module.init(
1499
+ jax.random.PRNGKey(0),
1500
+ decoder_input_ids=decoder_input_ids,
1501
+ decoder_attention_mask=decoder_attention_mask,
1502
+ decoder_position_ids=decoder_position_ids,
1503
+ encoder_hidden_states=encoder_outputs[0],
1504
+ init_cache=True,
1505
+ method=_decoder_forward, # we only need to call the decoder to init the cache
1506
+ )
1507
+ return unfreeze(init_variables["cache"])
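+ # Usage sketch (illustrative only): the returned cache is passed back to `decode` as `past_key_values`,
+ # together with explicit `decoder_position_ids`, e.g.
+ #   cache = model.init_cache(batch_size=1, max_length=max_len, encoder_outputs=encoder_outputs)
+ #   outputs = model.decode(decoder_input_ids, encoder_outputs, past_key_values=cache,
+ #                          decoder_position_ids=position_ids)
+ # where `max_len` and `position_ids` are placeholders for values chosen by the caller.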
1508
+
1509
+ @add_start_docstrings(WHISPER_ENCODE_INPUTS_DOCSTRING)
1510
+ @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=WhisperConfig)
1511
+ def encode(
1512
+ self,
1513
+ input_features: jnp.ndarray,
1514
+ attention_mask: Optional[jnp.ndarray] = None,
1515
+ output_attentions: Optional[bool] = None,
1516
+ output_hidden_states: Optional[bool] = None,
1517
+ return_dict: Optional[bool] = None,
1518
+ train: bool = False,
1519
+ params: dict = None,
1520
+ dropout_rng: PRNGKey = None,
1521
+ **kwargs,
1522
+ ):
1523
+ r"""
1524
+ Returns:
1525
+
1526
+ Example:
1527
+
1528
+ ```python
1529
+ >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
1530
+ >>> from datasets import load_dataset
1531
+
1532
+ >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
1533
+ >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
1534
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1535
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
1536
+ >>> input_features = inputs.input_features
1537
+ >>> encoder_outputs = model.encode(input_features=input_features)
1538
+ ```"""
1539
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1540
+ output_hidden_states = (
1541
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1542
+ )
1543
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1544
+
1545
+ # Handle any PRNG if needed
1546
+ rngs = {}
1547
+ if dropout_rng is not None:
1548
+ rngs["dropout"] = dropout_rng
1549
+
1550
+ def _encoder_forward(module, input_features, **kwargs):
1551
+ encode_module = module._get_encoder_module()
1552
+ return encode_module(input_features, **kwargs)
1553
+
1554
+ return self.module.apply(
1555
+ {"params": params or self.params},
1556
+ input_features=jnp.array(input_features, dtype="f4"),
1557
+ output_attentions=output_attentions,
1558
+ output_hidden_states=output_hidden_states,
1559
+ return_dict=return_dict,
1560
+ deterministic=not train,
1561
+ rngs=rngs,
1562
+ method=_encoder_forward,
1563
+ )
1564
+
1565
+ @add_start_docstrings(WHISPER_DECODE_INPUTS_DOCSTRING)
1566
+ @replace_return_docstrings(
1567
+ output_type=FlaxBaseModelOutputWithPastAndCrossAttentions,
1568
+ config_class=WhisperConfig,
1569
+ )
1570
+ def decode(
1571
+ self,
1572
+ decoder_input_ids,
1573
+ encoder_outputs,
1574
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
1575
+ decoder_attention_mask: Optional[jnp.ndarray] = None,
1576
+ decoder_position_ids: Optional[jnp.ndarray] = None,
1577
+ past_key_values: dict = None,
1578
+ output_attentions: Optional[bool] = None,
1579
+ output_hidden_states: Optional[bool] = None,
1580
+ return_dict: Optional[bool] = None,
1581
+ train: bool = False,
1582
+ params: dict = None,
1583
+ dropout_rng: PRNGKey = None,
1584
+ ):
1585
+ r"""
1586
+ Returns:
1587
+
1588
+ Example:
1589
+
1590
+ ```python
1591
+ >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
1592
+ >>> from datasets import load_dataset
+ >>> import jax.numpy as jnp
1593
+
1594
+ >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
1595
+ >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
1596
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1597
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
1598
+ >>> input_features = inputs.input_features
1599
+ >>> encoder_outputs = model.encode(input_features=input_features)
1600
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
1601
+
1602
+ >>> decoder_input_ids = jnp.ones((input_features.shape[0], 1), dtype="i4") * decoder_start_token_id
1603
+
1604
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
1605
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
1606
+ ```"""
1607
+
1608
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1609
+ output_hidden_states = (
1610
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1611
+ )
1612
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1613
+
1614
+ encoder_hidden_states = encoder_outputs[0]
1615
+
1616
+ batch_size, sequence_length = decoder_input_ids.shape
1617
+ if decoder_position_ids is None:
1618
+ if past_key_values is not None:
1619
+ raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
1620
+
1621
+ if decoder_attention_mask is not None:
1622
+ decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
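+ # Worked example (for illustration): a left-to-right mask [1, 1, 1, 0, 0] gives
+ # cumsum(-1) = [1, 2, 3, 3, 3]; multiplying by the mask and subtracting 1 yields
+ # position ids [0, 1, 2, -1, -1], i.e. real tokens are numbered from 0 and padded
+ # positions receive -1 (they are masked out anyway).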
1623
+ else:
1624
+ decoder_position_ids = jnp.broadcast_to(
1625
+ jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
1626
+ )
1627
+
1628
+ if decoder_attention_mask is None:
1629
+ decoder_attention_mask = jnp.ones((batch_size, sequence_length))
1630
+
1631
+ # Handle any PRNG if needed
1632
+ rngs = {}
1633
+ if dropout_rng is not None:
1634
+ rngs["dropout"] = dropout_rng
1635
+
1636
+ inputs = {"params": params or self.params}
1637
+
1638
+ # If past_key_values are passed, the cache is already initialized and a private flag init_cache has to be
1639
+ # passed down to ensure the cache is used. The cache must also be marked as mutable so that
1640
+ # it can be updated by the FlaxWhisperAttention module
1641
+ if past_key_values:
1642
+ inputs["cache"] = past_key_values
1643
+ mutable = ["cache"]
1644
+ else:
1645
+ mutable = False
1646
+
1647
+ def _decoder_forward(
1648
+ module,
1649
+ decoder_input_ids,
1650
+ decoder_attention_mask,
1651
+ decoder_position_ids,
1652
+ **kwargs,
1653
+ ):
1654
+ decoder_module = module._get_decoder_module()
1655
+ return decoder_module(
1656
+ input_ids=decoder_input_ids,
1657
+ attention_mask=decoder_attention_mask,
1658
+ position_ids=decoder_position_ids,
1659
+ **kwargs,
1660
+ )
1661
+
1662
+ outputs = self.module.apply(
1663
+ inputs,
1664
+ decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
1665
+ decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
1666
+ decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
1667
+ encoder_hidden_states=encoder_hidden_states,
1668
+ output_attentions=output_attentions,
1669
+ output_hidden_states=output_hidden_states,
1670
+ return_dict=return_dict,
1671
+ deterministic=not train,
1672
+ rngs=rngs,
1673
+ mutable=mutable,
1674
+ method=_decoder_forward,
1675
+ )
1676
+
1677
+ # add updated cache to model output
1678
+ if past_key_values is not None and return_dict:
1679
+ outputs, past = outputs
1680
+ outputs["past_key_values"] = unfreeze(past["cache"])
1681
+ return outputs
1682
+ elif past_key_values is not None and not return_dict:
1683
+ outputs, past = outputs
1684
+ outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
1685
+
1686
+ return outputs
1687
+
1688
+ @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
1689
+ def __call__(
1690
+ self,
1691
+ input_features: jnp.ndarray,
1692
+ decoder_input_ids: jnp.ndarray,
1693
+ attention_mask: Optional[jnp.ndarray] = None,
1694
+ decoder_attention_mask: Optional[jnp.ndarray] = None,
1695
+ position_ids: Optional[jnp.ndarray] = None,
1696
+ decoder_position_ids: Optional[jnp.ndarray] = None,
1697
+ output_attentions: Optional[bool] = None,
1698
+ output_hidden_states: Optional[bool] = None,
1699
+ freeze_encoder: Optional[bool] = None,
1700
+ return_dict: Optional[bool] = None,
1701
+ train: bool = False,
1702
+ params: dict = None,
1703
+ dropout_rng: PRNGKey = None,
1704
+ ):
1705
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1706
+ output_hidden_states = (
1707
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1708
+ )
1709
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1710
+
1711
+ # prepare decoder inputs
1712
+ if decoder_position_ids is None:
1713
+ if decoder_attention_mask is not None:
1714
+ decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
1715
+ else:
1716
+ batch_size, sequence_length = decoder_input_ids.shape
1717
+ decoder_position_ids = jnp.broadcast_to(
1718
+ jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
1719
+ )
1720
+ if decoder_attention_mask is None:
1721
+ decoder_attention_mask = jnp.ones_like(decoder_input_ids)
1722
+
1723
+ # Handle any PRNG if needed
1724
+ rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
1725
+
1726
+ return self.module.apply(
1727
+ {"params": params or self.params},
1728
+ input_features=jnp.array(input_features, dtype="f4"),
1729
+ decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
1730
+ decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
1731
+ decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
1732
+ output_attentions=output_attentions,
1733
+ output_hidden_states=output_hidden_states,
1734
+ freeze_encoder=freeze_encoder,
1735
+ return_dict=return_dict,
1736
+ deterministic=not train,
1737
+ rngs=rngs,
1738
+ )
1739
+
1740
+
1741
+ @add_start_docstrings(
1742
+ ("The bare Whisper Model transformer outputting raw hidden-states without any specific head on top."),
1743
+ WHISPER_START_DOCSTRING,
1744
+ )
1745
+ class FlaxWhisperModel(FlaxWhisperPreTrainedModel):
1746
+ config: WhisperConfig
1747
+ dtype: jnp.dtype = jnp.float32 # the dtype of the computation
1748
+ params_dtype: jnp.dtype = jnp.float32
1749
+ module_class = FlaxWhisperModule
1750
+
1751
+
1752
+ append_call_sample_docstring(FlaxWhisperModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)
1753
+
1754
+
1755
+ class FlaxWhisperForConditionalGenerationModule(nn.Module):
1756
+ config: WhisperConfig
1757
+ dtype: jnp.dtype = jnp.float32
1758
+ params_dtype: jnp.dtype = jnp.float32
1759
+ use_scan: bool = False
1760
+ gradient_checkpointing: bool = False
1761
+
1762
+ def setup(self) -> None:
1763
+ self.model = FlaxWhisperModule(
1764
+ config=self.config,
1765
+ dtype=self.dtype,
1766
+ params_dtype=self.params_dtype,
1767
+ use_scan=self.use_scan,
1768
+ gradient_checkpointing=self.gradient_checkpointing,
1769
+ )
1770
+ self.lm_head = DenseGeneral(
1771
+ self.config.vocab_size,
1772
+ use_bias=False,
1773
+ dtype=self.dtype,
1774
+ params_dtype=self.params_dtype,
1775
+ kernel_axes=("embed", "vocab"),
1776
+ )
1777
+
1778
+ def _get_encoder_module(self):
1779
+ return self.model.encoder
1780
+
1781
+ def _get_decoder_module(self):
1782
+ return self.model.decoder
1783
+
1784
+ def __call__(
1785
+ self,
1786
+ input_features,
1787
+ decoder_input_ids,
1788
+ decoder_attention_mask: jnp.ndarray = None,
1789
+ decoder_position_ids: jnp.ndarray = None,
1790
+ position_ids: jnp.ndarray = None,
1791
+ attention_mask: jnp.ndarray = None,
1792
+ output_attentions: bool = False,
1793
+ output_hidden_states: bool = False,
1794
+ freeze_encoder: bool = False,
1795
+ return_dict: bool = True,
1796
+ deterministic: bool = True,
1797
+ ):
1798
+ outputs = self.model(
1799
+ input_features=input_features,
1800
+ decoder_input_ids=decoder_input_ids,
1801
+ decoder_attention_mask=decoder_attention_mask,
1802
+ decoder_position_ids=decoder_position_ids,
1803
+ output_attentions=output_attentions,
1804
+ output_hidden_states=output_hidden_states,
1805
+ freeze_encoder=freeze_encoder,
1806
+ return_dict=return_dict,
1807
+ deterministic=deterministic,
1808
+ )
1809
+
1810
+ hidden_states = outputs[0]
1811
+
1812
+ if self.config.tie_word_embeddings:
1813
+ shared_embedding = self.model.decoder.embed_tokens.variables["params"]["embedding"]
1814
+ lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
1815
+ else:
1816
+ lm_logits = self.lm_head(hidden_states)
1817
+
1818
+ if not return_dict:
1819
+ output = (lm_logits,) + outputs[1:]
1820
+ return output
1821
+
1822
+ return FlaxSeq2SeqLMOutput(
1823
+ logits=lm_logits,
1824
+ decoder_hidden_states=outputs.decoder_hidden_states,
1825
+ decoder_attentions=outputs.decoder_attentions,
1826
+ cross_attentions=outputs.cross_attentions,
1827
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1828
+ encoder_hidden_states=outputs.encoder_hidden_states,
1829
+ encoder_attentions=outputs.encoder_attentions,
1830
+ )
1831
+
1832
+
1833
+ @add_start_docstrings("The Whisper Model with a language modeling head.", WHISPER_START_DOCSTRING)
1834
+ class FlaxWhisperForConditionalGeneration(FlaxWhisperPreTrainedModel):
1835
+ module_class = FlaxWhisperForConditionalGenerationModule
1836
+
1837
+ @add_start_docstrings(WHISPER_DECODE_INPUTS_DOCSTRING)
1838
+ @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=WhisperConfig)
1839
+ def decode(
1840
+ self,
1841
+ decoder_input_ids,
1842
+ encoder_outputs,
1843
+ encoder_attention_mask: Optional[jnp.ndarray] = None,
1844
+ decoder_attention_mask: Optional[jnp.ndarray] = None,
1845
+ decoder_position_ids: Optional[jnp.ndarray] = None,
1846
+ past_key_values: dict = None,
1847
+ output_attentions: Optional[bool] = None,
1848
+ output_hidden_states: Optional[bool] = None,
1849
+ return_dict: Optional[bool] = None,
1850
+ train: bool = False,
1851
+ params: dict = None,
1852
+ dropout_rng: PRNGKey = None,
1853
+ ):
1854
+ r"""
1855
+ Returns:
1856
+
1857
+ Example:
1858
+
1859
+ ```python
1860
+ >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
1861
+ >>> from datasets import load_dataset
+ >>> import jax.numpy as jnp
1862
+
1863
+ >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
1864
+ >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
1865
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1866
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
1867
+ >>> input_features = inputs.input_features
1868
+ >>> encoder_outputs = model.encode(input_features=input_features)
1869
+ >>> decoder_start_token_id = model.config.decoder_start_token_id
1870
+
1871
+ >>> decoder_input_ids = jnp.ones((input_features.shape[0], 1), dtype="i4") * decoder_start_token_id
1872
+
1873
+ >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
1874
+ >>> last_decoder_hidden_states = outputs.last_hidden_state
1875
+ ```"""
1876
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1877
+ output_hidden_states = (
1878
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1879
+ )
1880
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
1881
+
1882
+ encoder_hidden_states = encoder_outputs[0]
1883
+
1884
+ batch_size, sequence_length = decoder_input_ids.shape
1885
+ if decoder_position_ids is None:
1886
+ if past_key_values is not None:
1887
+ raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
1888
+
1889
+ if decoder_attention_mask is not None:
1890
+ decoder_position_ids = (decoder_attention_mask.cumsum(-1) * decoder_attention_mask) - 1
1891
+ else:
1892
+ decoder_position_ids = jnp.broadcast_to(
1893
+ jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
1894
+ )
1895
+ if decoder_attention_mask is None:
1896
+ decoder_attention_mask = jnp.ones((batch_size, sequence_length), dtype="i4")
1897
+
1898
+ # Handle any PRNG if needed
1899
+ rngs = {}
1900
+ if dropout_rng is not None:
1901
+ rngs["dropout"] = dropout_rng
1902
+
1903
+ inputs = {"params": params or self.params}
1904
+
1905
+ # If past_key_values are passed, the cache is already initialized and a private flag init_cache has to be
1906
+ # passed down to ensure the cache is used. The cache must also be marked as mutable so that
1907
+ # it can be updated by the FlaxWhisperAttention module
1908
+ if past_key_values:
1909
+ inputs["cache"] = past_key_values
1910
+ mutable = ["cache"]
1911
+ else:
1912
+ mutable = False
1913
+
1914
+ def _decoder_forward(
1915
+ module,
1916
+ decoder_input_ids,
1917
+ decoder_attention_mask,
1918
+ decoder_position_ids,
1919
+ **kwargs,
1920
+ ):
1921
+ decoder_module = module._get_decoder_module()
1922
+ outputs = decoder_module(
1923
+ input_ids=decoder_input_ids,
1924
+ attention_mask=decoder_attention_mask,
1925
+ position_ids=decoder_position_ids,
1926
+ **kwargs,
1927
+ )
1928
+ hidden_states = outputs[0]
1929
+
1930
+ if self.config.tie_word_embeddings:
1931
+ shared_embedding = module.model.decoder.embed_tokens.variables["params"]["embedding"]
1932
+ lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
1933
+ else:
1934
+ lm_logits = module.lm_head(hidden_states)
1935
+
1936
+ return lm_logits, outputs
1937
+
1938
+ outputs = self.module.apply(
1939
+ inputs,
1940
+ decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
1941
+ decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
1942
+ decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
1943
+ encoder_hidden_states=encoder_hidden_states,
1944
+ output_attentions=output_attentions,
1945
+ output_hidden_states=output_hidden_states,
1946
+ return_dict=return_dict,
1947
+ deterministic=not train,
1948
+ rngs=rngs,
1949
+ mutable=mutable,
1950
+ method=_decoder_forward,
1951
+ )
1952
+
1953
+ if past_key_values is None:
1954
+ lm_logits, decoder_outputs = outputs
1955
+ else:
1956
+ (lm_logits, decoder_outputs), past = outputs
1957
+
1958
+ if return_dict:
1959
+ outputs = FlaxCausalLMOutputWithCrossAttentions(
1960
+ logits=lm_logits,
1961
+ hidden_states=decoder_outputs.hidden_states,
1962
+ attentions=decoder_outputs.attentions,
1963
+ cross_attentions=decoder_outputs.cross_attentions,
1964
+ )
1965
+ else:
1966
+ outputs = (lm_logits,) + decoder_outputs[1:]
1967
+
1968
+ # add updated cache to model output
1969
+ if past_key_values is not None and return_dict:
1970
+ outputs["past_key_values"] = unfreeze(past["cache"])
1971
+ return outputs
1972
+ elif past_key_values is not None and not return_dict:
1973
+ outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
1974
+
1975
+ return outputs
1976
+
1977
+ def generate(
1978
+ self,
1979
+ input_features,
1980
+ generation_config=None,
1981
+ logits_processor=None,
1982
+ return_timestamps=None,
1983
+ task=None,
1984
+ language=None,
1985
+ is_multilingual=None,
1986
+ **kwargs,
1987
+ ):
1988
+ if generation_config is None:
1989
+ generation_config = self.generation_config
1990
+
1991
+ if return_timestamps is not None:
1992
+ generation_config.return_timestamps = return_timestamps
1993
+
1994
+ if task is not None:
1995
+ generation_config.task = task
1996
+
1997
+ if is_multilingual is not None:
1998
+ generation_config.is_multilingual = is_multilingual
1999
+
2000
+ if language is not None:
2001
+ generation_config.language = language
2002
+
2003
+ if kwargs is not None and "decoder_input_ids" in kwargs:
2004
+ decoder_input_length = len(kwargs["decoder_input_ids"])
2005
+ else:
2006
+ decoder_input_length = 1
2007
+
2008
+ forced_decoder_ids = []
2009
+
2010
+ if hasattr(generation_config, "is_multilingual") and generation_config.is_multilingual:
2011
+ if hasattr(generation_config, "language"):
2012
+ forced_decoder_ids.append((1, generation_config.lang_to_id[generation_config.language]))
2013
+ else:
2014
+ forced_decoder_ids.append((1, None))
2015
+
2016
+ if hasattr(generation_config, "task"):
2017
+ forced_decoder_ids.append((2, generation_config.task_to_id[generation_config.task]))
2018
+ else:
2019
+ forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"]))
2020
+
2021
+ if (
2022
+ hasattr(generation_config, "return_timestamps") and generation_config.return_timestamps
2023
+ ) or return_timestamps:
2024
+ logits_processor = [
2025
+ FlaxWhisperTimeStampLogitsProcessor(generation_config, self.config, decoder_input_length)
2026
+ ]
2027
+ else:
2028
+ if forced_decoder_ids and forced_decoder_ids[-1][0] != generation_config.no_timestamps_token_id:
2029
+ idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1
2030
+ forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id))
2031
+
2032
+ if len(forced_decoder_ids) > 0:
2033
+ generation_config.forced_decoder_ids = forced_decoder_ids
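+ # Illustrative example: for a multilingual checkpoint with language "<|en|>", task "transcribe" and
+ # timestamps disabled, the list built above has the form
+ #   [(1, <lang token id>), (2, <transcribe token id>), (3, <no-timestamps token id>)]
+ # i.e. (decoding position, forced token id) pairs; the concrete ids depend on the tokenizer.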
2034
+
2035
+ return super().generate(
2036
+ input_features,
2037
+ generation_config,
2038
+ logits_processor=logits_processor,
2039
+ **kwargs,
2040
+ )
2041
+
2042
+ def pipeline_generate(
2043
+ self,
2044
+ input_features,
2045
+ forced_decoder_ids,
2046
+ return_timestamps=False,
2047
+ generation_config=None,
2048
+ **kwargs,
2049
+ ):
2050
+ if generation_config is None:
2051
+ generation_config = self.generation_config
2052
+
2053
+ # override the generation config forced decoder ids in preference of the ones provided to this method
2054
+ generation_config.forced_decoder_ids = None
2055
+
2056
+ logits_processor = FlaxLogitsProcessorList()
2057
+ logits_processor.append(FlaxStaticForceTokensLogitsProcessor(forced_decoder_ids))
2058
+
2059
+ if hasattr(generation_config, "return_timestamps") and return_timestamps:
2060
+ logits_processor.append(FlaxWhisperTimeStampLogitsProcessor(generation_config, self.config, 1))
2061
+
2062
+ return super().generate(
2063
+ input_features,
2064
+ generation_config,
2065
+ logits_processor=logits_processor,
2066
+ **kwargs,
2067
+ )
2068
+
2069
+ def prepare_inputs_for_generation(
2070
+ self,
2071
+ decoder_input_ids,
2072
+ max_length,
2073
+ attention_mask: Optional[jax.Array] = None,
2074
+ decoder_attention_mask: Optional[jax.Array] = None,
2075
+ encoder_outputs=None,
2076
+ **kwargs,
2077
+ ):
2078
+ # initializing the cache
2079
+ batch_size, seq_length = decoder_input_ids.shape
2080
+
2081
+ past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
2082
+ # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
2083
+ # But since the decoder uses a causal mask, those positions are masked anyways.
2084
+ # Thus we can create a single static attention_mask here, which is more efficient for compilation
2085
+ extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
2086
+ if decoder_attention_mask is not None:
2087
+ position_ids = decoder_attention_mask.cumsum(-1) - 1
2088
+ extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
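+ # Illustrative example: with max_length = 6 and a user-provided mask [[1, 1, 0]], the static mask of
+ # ones becomes [[1, 1, 0, 1, 1, 1]] after the dynamic update; the trailing ones are harmless because
+ # future positions are hidden by the causal mask. Position ids are computed from the provided mask as
+ # cumsum(-1) - 1, giving [[0, 1, 1]] in this example.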
2089
+ else:
2090
+ position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
2091
+
2092
+ return {
2093
+ "past_key_values": past_key_values,
2094
+ "encoder_outputs": encoder_outputs,
2095
+ "encoder_attention_mask": attention_mask,
2096
+ "decoder_attention_mask": extended_attention_mask,
2097
+ "decoder_position_ids": position_ids,
2098
+ }
2099
+
2100
+ def update_inputs_for_generation(self, model_outputs, model_kwargs):
2101
+ model_kwargs["past_key_values"] = model_outputs.past_key_values
2102
+ model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
2103
+ return model_kwargs
2104
+
2105
+
2106
+ FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING = r"""
2107
+ Returns:
2108
+
2109
+ Transcription example:
2110
+
2111
+ ```python
2112
+ >>> from transformers import WhisperProcessor, FlaxWhisperForConditionalGeneration
2113
+ >>> from datasets import load_dataset
2114
+
2115
+ >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
2116
+ >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True)
2117
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
2118
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np")
2119
+ >>> input_features = inputs.input_features
2120
+ >>> generated_ids = model.generate(input_features=input_features)
2121
+ >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
2122
+ >>> transcription
2123
+ ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
2124
+ ```
2125
+ """
2126
+
2127
+ overwrite_call_docstring(
2128
+ FlaxWhisperForConditionalGeneration,
2129
+ WHISPER_INPUTS_DOCSTRING + FLAX_WHISPER_CONDITIONAL_GENERATION_DOCSTRING,
2130
+ )
2131
+ append_replace_return_docstrings(
2132
+ FlaxWhisperForConditionalGeneration,
2133
+ output_type=FlaxSeq2SeqLMOutput,
2134
+ config_class=_CONFIG_FOR_DOC,
2135
+ )
distil_whisper/partitioner.py ADDED
@@ -0,0 +1,965 @@
1
+ # Copyright 2022 The T5X Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Utilities for partitioning."""
16
+
17
+ import abc
18
+ import collections
19
+ import dataclasses
20
+ import typing
21
+ from typing import Any, Callable, Optional, Sequence, Tuple, Union
22
+
23
+ import cached_property
24
+ import jax
25
+ import numpy as np
26
+ from absl import logging
27
+ from flax import traverse_util
28
+ from flax.linen import partitioning as flax_partitioning
29
+ from jax import numpy as jnp
30
+ from jax import random
31
+ from jax.experimental import multihost_utils
32
+ from jax.experimental.mesh_utils import create_hybrid_device_mesh
33
+ from jax.experimental.pjit import pjit as jax_pjit
34
+ from jax.sharding import Mesh, PartitionSpec
35
+
36
+
37
+ JaxDevice = Any
38
+ TpuMesh = Tuple[int, int, int, int] # (x, y, z, num_cores).
39
+ OtherMesh = Tuple[int, int]
40
+ HardwareMesh = Union[TpuMesh, OtherMesh]
41
+ PyTreeDef = type(jax.tree_util.tree_structure(None))
42
+ TrainState = Any
43
+ LogicalAxisRules = Sequence[Tuple[str, Optional[str]]]
44
+
45
+ if typing.TYPE_CHECKING: # See b/163639353
46
+ cached_property = property # pylint: disable=invalid-name
47
+ else:
48
+ cached_property = cached_property.cached_property
49
+
50
+
51
+ class AxisNames(tuple):
52
+ """Tuple of strings specifying name for each axis.
53
+
54
+ We create a separate class for this so JAX's pytree utilities can distinguish
55
+ it from a tuple that should be treated as a pytree, instead treating it as a
56
+ leaf.
57
+ """
58
+
59
+ def __new__(cls, *names):
60
+ return tuple.__new__(AxisNames, names)
61
+
62
+ def __repr__(self):
63
+ return "AxisNames%s" % tuple.__repr__(self)
64
+
65
+
66
+ # pjit wrappers for cpu fallback.
67
+ # ----------------------------------------------------------------------------
68
+ # TODO(levskaya): This function is now no different than jax_pjit, but callers
69
+ # currently depend on `backend` argument
70
+ def pjit(
71
+ fun: Callable, # pylint: disable=g-bare-generic
72
+ in_axis_resources,
73
+ out_axis_resources,
74
+ static_argnums: Union[int, Sequence[int]] = (),
75
+ donate_argnums: Union[int, Sequence[int]] = (),
76
+ backend: Optional[str] = None,
77
+ ):
78
+ """Wrapper for pjit."""
79
+ del backend
80
+ return jax_pjit(
81
+ fun,
82
+ in_axis_resources,
83
+ out_axis_resources,
84
+ static_argnums=static_argnums,
85
+ donate_argnums=donate_argnums,
86
+ )
87
+
88
+
89
+ # pjit wrappers for cpu fallback.
90
+ # -----------------------------------------------------------------------------
91
+ # TODO(levskaya): upstream this fallback behavior to jax pjit.
92
+ def pjit_with_cpu_fallback(
93
+ fun: Callable, # pylint: disable=g-bare-generic
94
+ in_axis_resources,
95
+ out_axis_resources,
96
+ static_argnums: Union[int, Sequence[int]] = (),
97
+ donate_argnums: Union[int, Sequence[int]] = (),
98
+ backend: Optional[str] = None,
99
+ ):
100
+ """Wrapper for pjit that calls normal jit on cpu."""
101
+ if jax.devices(backend)[0].platform == "cpu":
102
+ return jax.jit(fun, static_argnums=static_argnums, donate_argnums=donate_argnums)
103
+ else:
104
+ return jax_pjit(
105
+ fun,
106
+ in_axis_resources,
107
+ out_axis_resources,
108
+ static_argnums=static_argnums,
109
+ donate_argnums=donate_argnums,
110
+ )
111
+
112
+
113
+ def with_sharding_constraint(x, axis_resources):
114
+ """Wrapper for pjit with_sharding_constraint, no-op on cpu or outside pjit."""
115
+ if jax.devices()[0].platform == "cpu" or not global_mesh_defined():
116
+ return x
117
+ else:
118
+ return jax.experimental.pjit.with_sharding_constraint(x, axis_resources)
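+ # Usage sketch (illustrative only): inside a pjit-compiled function one can pin the sharding of an
+ # intermediate value, e.g.
+ #   y = with_sharding_constraint(y, PartitionSpec("data", None))
+ # which, as implemented above, is a no-op on CPU or when no global mesh is active.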
119
+
120
+
121
+ # pjit Mesh creation functions.
122
+ # -----------------------------------------------------------------------------
123
+ def bounds_from_last_device(last_device: JaxDevice) -> HardwareMesh:
124
+ """Get the bound from the given last device."""
125
+ # Must be passed the device at the highest-coordinate corner of the
126
+ # relevant mesh, which is a requirement we know is satisfied by the last
127
+ # device in jax.devices().
128
+ if hasattr(last_device, "coords"):
129
+ x, y, z = last_device.coords
130
+ return x + 1, y + 1, z + 1, last_device.core_on_chip + 1
131
+ else:
132
+ # On non-TPU platforms, the "mesh" is hosts x devices per host in order
133
+ # to take advantage of faster within-host interconnect.
134
+ return jax.host_count(), jax.local_device_count()
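+ # Illustrative examples: on a hypothetical TPU v3 4x8 slice with 2 cores per chip, the last device has
+ # coords (3, 7, 0) and core_on_chip 1, so this returns (4, 8, 1, 2); on CPU/GPU it returns
+ # (num_hosts, num_local_devices) instead.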
135
+
136
+
137
+ def get_coords(device: JaxDevice) -> HardwareMesh:
138
+ """Returns the coordinates of the given device."""
139
+ if hasattr(device, "coords"):
140
+ return (*device.coords, device.core_on_chip)
141
+ return (device.process_index, device.id % jax.local_device_count())
142
+
143
+
144
+ def global_mesh_defined():
145
+ """Checks if global xmap/pjit mesh resource environment is defined."""
146
+ maps_env = jax.experimental.maps.thread_resources.env
147
+ return maps_env.physical_mesh.devices.shape != () # pylint: disable=g-explicit-bool-comparison
148
+
149
+
150
+ def get_mesh(
151
+ model_parallel_submesh: HardwareMesh,
152
+ input_devices: Sequence[JaxDevice] = (),
153
+ input_local_devices: Sequence[JaxDevice] = (),
154
+ tile_by_host_if_needed: bool = True,
155
+ backend: Optional[str] = None,
156
+ ) -> Mesh:
157
+ """Construct an xmap/pjit Mesh for the given model-parallel submesh.
158
+
159
+ The resulting mesh has two resource axes: 'model', with the provided submesh
160
+ shape, and 'data', which covers the rest of the mesh.
161
+
162
+ Args:
163
+ model_parallel_submesh: a HardwareMesh spec, namely (x,y,z,core) on TPU for
164
+ a single model-parallel replica's "tile" in the physical device mesh. The
165
+ first three elements (`x`, `y`, and `z`) should be factors of the pod
166
+ slice; e.g., if you are using df_4x8, then `x` should be a factor of 4
167
+ (one of 1, 2, 4), `y` should be a factor of 8 (one of 1, 2, 4, 8), and `z`
168
+ must be 1, because TPU v3 slices are only 2D. `z` can be >1 for TPU v4
169
+ (and maybe later TPUs) that allow 3D slices. `core` is the number of cores
170
+ to use from each TPU node. As communication is usually fastest inside the
171
+ same node, if you need a tile of more than 1 core, then
172
+ you should first increase `core`: e.g., for TPU v3, (1,1,1,2) is better
173
+ than (2,1,1,1). To pick a good spec, try a few possible values until you
174
+ get high TPU utilization.
175
+ input_devices: the devices to use, will use jax.devices() if this is not
176
+ set.
177
+ input_local_devices: the local devices to use, will use jax.local_devices()
178
+ if this is not set.
179
+ tile_by_host_if_needed: JAX currently requires that the parts of any sharded
180
+ array that are located on one host's local devices form a single
181
+ contiguous slice. A best effort will be made to achieve this without
182
+ "tiling" the device assignment over hosts (which can reduce XLA collective
183
+ performance). If this flag is True, then the device assignment will be
184
+ tiled over hosts if necessary to satisfy this constraint and create a
185
+ buildable mesh; if false, mesh construction will fail instead.
186
+ backend: get devices from the pinned backend, if specified. This is
187
+ useful for explicitly specifying the devices other than relying on
188
+ jax_platform_name.
189
+
190
+ Returns:
191
+ A xmap / pjit Mesh containing the virtual device mesh with data, model axes.
192
+ """
193
+ input_devices = input_devices or jax.devices(backend)
194
+ input_local_devices = input_local_devices or jax.local_devices(0, backend)
195
+ # Sort input_devices based on coords, as backends might not return devices
196
+ # in order.
197
+ last_device = sorted(input_devices, key=get_coords)[-1]
198
+ last_input_local_devices = sorted(input_local_devices, key=get_coords)[-1]
199
+ logging.info(
200
+ "last device coords : %r\nlast local device coords: %r",
201
+ get_coords(last_device),
202
+ get_coords(last_input_local_devices),
203
+ )
204
+ global_hardware_mesh = bounds_from_last_device(last_device)
205
+ mesh_ndim = len(global_hardware_mesh)
206
+ local_hardware_mesh = bounds_from_last_device(last_input_local_devices)
207
+ mesh_err = (
208
+ f"each dimension of the model parallel submesh {model_parallel_submesh} "
209
+ "must be a factor of the corresponding dimension of the global device "
210
+ f"mesh {global_hardware_mesh}"
211
+ )
212
+ assert not any(g % m for g, m in zip(global_hardware_mesh, model_parallel_submesh)), mesh_err
213
+ assert not any(g % l for g, l in zip(global_hardware_mesh, local_hardware_mesh))
214
+ devices = np.empty(global_hardware_mesh, dtype=object)
215
+ for device in input_devices:
216
+ device_coords = get_coords(device)
217
+ devices[device_coords] = device
218
+ tile_by_host = tile_by_host_if_needed
219
+ if len(global_hardware_mesh) == 4:
220
+ # enable contiguous local chunks without host tiling by making Z major
221
+ global_hardware_mesh = typing.cast(Tuple[int, int, int, int], global_hardware_mesh)
222
+ model_parallel_submesh = typing.cast(Tuple[int, int, int, int], model_parallel_submesh)
223
+ gx, gy, gz, gc = global_hardware_mesh
224
+ mx, my, mz, mc = model_parallel_submesh
225
+ if (mx == gx > 1 and my == mz == 1) or (mx == 1 and my == gy > 1 and mz == gz > 1):
226
+ logging.info("ensuring YZ plane has a Z-major device order")
227
+ # YZ should be ZY
228
+ assert mc == gc, (mc, gc)
229
+ global_hardware_mesh = gx, gz, gy, gc
230
+ model_parallel_submesh = mx, mz, my, mc
231
+ devices = devices.swapaxes(1, 2)
232
+ tile_by_host = False
233
+ if (my == gy > 1 and mx == mz == 1) or (my == 1 and mx == gx > 1 and mz == gz > 1):
234
+ logging.info("ensuring XZ plane has a Z-major device order")
235
+ # XZ should be ZX
236
+ assert mc == gc, (mc, gc)
237
+ global_hardware_mesh = gz, gy, gx, gc
238
+ model_parallel_submesh = mz, my, mx, mc
239
+ devices = devices.swapaxes(0, 2)
240
+ tile_by_host = False
241
+ if tile_by_host:
242
+ logging.warning(
243
+ "Tiling device assignment mesh by hosts, which may lead to "
244
+ "reduced XLA collective performance. To avoid this, modify "
245
+ "the model parallel submesh or run with more tasks per host."
246
+ )
247
+ tile_err = (
248
+ "to tile the mesh by hosts, each dimension of the model parallel "
249
+ "submesh must be either a factor or a multiple of the corresponding "
250
+ "dimension of the per-host submesh"
251
+ )
252
+
253
+ def dh_dd_mh_md(g: int, m: int, l: int) -> Tuple[int, int, int, int]:
254
+ """Split a global mesh dimension into four tiling components.
255
+
256
+ Args:
257
+ g: global mesh bounds dimension size
258
+ m: model-parallel submesh bounds dimension size
259
+ l: local submesh bounds dimension size
260
+
261
+ Returns:
262
+ The resulting tuple divides the dimension into the hosts component of
263
+ the data-parallel submesh, the devices component of the data-parallel
264
+ submesh, the hosts component of the model-parallel submesh, and the
265
+ devices component of the model-parallel submesh.
266
+ """
267
+ d = g // m
268
+ if m >= l:
269
+ assert not m % l, tile_err
270
+ return (d, 1, m // l, l)
271
+ else:
272
+ assert not l % m, tile_err
273
+ return (d // (l // m), l // m, 1, m)
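+ # Worked example (illustration): with g=8, m=2, l=4 we get d = 4 and, since m < l, the split
+ # (d // (l // m), l // m, 1, m) = (2, 2, 1, 2): 2 data-parallel host groups, 2 data-parallel devices
+ # per host, 1 model-parallel host group and 2 model-parallel devices.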
274
+
275
+ # e.g. [(x_data_hosts, x_data_devs, x_model_hosts, x_model_devs), ...]
276
+ dh_dd_mh_md_tups = map(
277
+ dh_dd_mh_md,
278
+ global_hardware_mesh,
279
+ model_parallel_submesh,
280
+ local_hardware_mesh,
281
+ )
282
+ # reshape to e.g. (x_dh, x_dd, x_mh, x_md, y_dh, ...)
283
+ devices = devices.reshape(*(s for t in dh_dd_mh_md_tups for s in t)) # pylint: disable=g-complex-comprehension
284
+ # TODO(jekbradbury): reorder local subgroups for ring locality
285
+ # Transpose to [data_host], [data_device], [model_host], [model_device]
286
+ # block ordering e.g. (x_dh, y_dh, ..., x_dd, y_dd, ...)
287
+ devices = devices.transpose(
288
+ *(4 * i for i in range(mesh_ndim)),
289
+ *(4 * i + 1 for i in range(mesh_ndim)),
290
+ *(4 * i + 2 for i in range(mesh_ndim)),
291
+ *(4 * i + 3 for i in range(mesh_ndim)),
292
+ )
293
+ else:
294
+ # e.g. [(x_data, x_model), (y_data, y_model), ...]
295
+ model_data_tups = [(g // m, m) for g, m in zip(global_hardware_mesh, model_parallel_submesh)]
296
+ # reshape to e.g. (x_data, x_model, y_data, y_model...)
297
+ devices = devices.reshape(*(s for t in model_data_tups for s in t)) # pylint: disable=g-complex-comprehension
298
+ # TODO(jekbradbury): reorder small subgroups for ring locality
299
+ # transpose to e.g. (x_data, y_data, ..., x_model, ...)
300
+ devices = devices.transpose(*(2 * i for i in range(mesh_ndim)), *(2 * i + 1 for i in range(mesh_ndim)))
301
+ # reshape to (data, model)
302
+ devices = devices.reshape(-1, np.prod(model_parallel_submesh))
303
+ global_mesh = Mesh(devices, ["data", "model"])
304
+ logging.info("global_mesh axis_names: %s", global_mesh.axis_names)
305
+ logging.info("global_mesh devices: %s", global_mesh.devices)
306
+ logging.info("global_mesh devices shape: %s", global_mesh.devices.shape)
307
+ return global_mesh
308
+
309
+
310
+ def get_cpu_mesh() -> Mesh:
311
+ """Trivial mesh for CPU Testing."""
312
+ devices = np.empty((jax.host_count(), jax.local_device_count()), dtype=object)
313
+ for device in jax.devices():
314
+ devices[device.process_index, device.id % jax.local_device_count()] = device
315
+ return Mesh(devices, ["data", "model"])
316
+
317
+
318
+ def get_gpu_mesh(num_partitions: int) -> Mesh:
319
+ """Mesh for GPUs that preferentially places 'model' on NVLink."""
320
+ nvlink_size = jax.local_device_count()
321
+ dcn_size = jax.process_count()
322
+ nvlink_mp = min(num_partitions, nvlink_size)
323
+ nvlink_dp, extra1 = divmod(nvlink_size, nvlink_mp)
324
+ dcn_mp, extra2 = divmod(num_partitions, nvlink_mp)
325
+ assert not (
326
+ extra1 or extra2
327
+ ), "number of partitions on GPU must be a factor or multiple of the number of local devices"
328
+ dcn_dp = dcn_size // dcn_mp
329
+
330
+ devices = create_hybrid_device_mesh(
331
+ mesh_shape=[nvlink_dp, nvlink_mp],
332
+ dcn_mesh_shape=[dcn_dp, dcn_mp],
333
+ process_is_granule=True,
334
+ )
335
+
336
+ global_mesh = Mesh(devices, ["data", "model"])
337
+ logging.info("global_mesh axis_names: %s", global_mesh.axis_names)
338
+ logging.info("global_mesh devices: %s", global_mesh.devices)
339
+ return global_mesh
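+ # Worked example (illustration): with 8 local GPUs, 2 processes and num_partitions=4 this gives
+ # nvlink_mp=4, nvlink_dp=2, dcn_mp=1, dcn_dp=2, i.e. a hybrid mesh of shape (data=4, model=4)
+ # over the 16 devices, with each model group kept inside a single NVLink domain.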
340
+
341
+
342
+ def default_mesh(
343
+ num_partitions: int,
344
+ model_parallel_submesh: Optional[HardwareMesh] = None,
345
+ backend: Optional[str] = None,
346
+ ) -> Mesh:
347
+ """Attempt to return a default mesh for simple cases.
348
+
349
+ Args:
350
+ num_partitions: number of partitions to use, will be ignored if
351
+ model_parallel_submesh is provided.
352
+ model_parallel_submesh: 4-tuple that specifies the x,y,z,c submesh to use as
353
+ the model-parallel device tile.
354
+ backend: get devices from the pinned backend, if specified. This is useful
355
+ for explicitly specifying the devices other than relying on
356
+ jax_platform_name.
357
+
358
+ Returns:
359
+ xmap/pjit 2D Mesh with 'data', 'model' mesh axes.
360
+ """
361
+ last_device = jax.devices(backend)[-1]
362
+ platform = last_device.platform
363
+ device_kind = last_device.device_kind
364
+ bounds = bounds_from_last_device(last_device)
365
+
366
+ if model_parallel_submesh:
367
+ return get_mesh(model_parallel_submesh, backend=backend)
368
+
369
+ if platform == "cpu":
370
+ return get_cpu_mesh()
371
+ elif platform == "gpu":
372
+ return get_gpu_mesh(num_partitions)
373
+
374
+ mps = None
375
+ if device_kind in ("TPU v2", "TPU v3"):
376
+ if num_partitions == 1:
377
+ mps = (1, 1, 1, 1)
378
+ elif num_partitions == 2:
379
+ mps = (1, 1, 1, 2)
380
+ elif num_partitions == 4:
381
+ mps = (2, 1, 1, 2)
382
+ elif num_partitions == 8:
383
+ mps = (2, 2, 1, 2)
384
+ elif num_partitions == 16:
385
+ mps = (4, 2, 1, 2)
386
+ # assume the use of megacore on TPU v4
387
+ elif (device_kind == "TPU v4" or device_kind == "TPU v4 lite") and bounds[3] == 1:
388
+ if num_partitions == 1:
389
+ mps = (1, 1, 1, 1)
390
+ elif num_partitions == 2:
391
+ mps = (1, 2, 1, 1)
392
+ elif num_partitions == 4:
393
+ if bounds[0] >= 4:
394
+ mps = (4, 1, 1, 1)
395
+ else:
396
+ mps = (2, 2, 1, 1)
397
+ elif num_partitions == 8:
398
+ if bounds[2] >= 8:
399
+ mps = (1, 1, 8, 1)
400
+ else:
401
+ mps = (4, 2, 1, 1)
402
+ elif num_partitions == 16:
403
+ if bounds[2] >= 16:
404
+ mps = (1, 1, 16, 1)
405
+ elif bounds[0] >= 8:
406
+ mps = (8, 2, 1, 1)
407
+ elif bounds[0] >= 4:
408
+ mps = (4, 4, 1, 1)
409
+ else:
410
+ mps = (2, 2, 4, 1)
411
+
412
+ if mps is None:
413
+ raise ValueError(
414
+ "No default mesh for this configuration: specify " "config.model_parallel_submesh explicitly."
415
+ )
416
+ return get_mesh(mps, backend=backend)
417
+
418
+
419
+ # Data chunking helper.
420
+ # -----------------------------------------------------------------------------
421
+ @dataclasses.dataclass
422
+ class LocalChunkInfo:
423
+ # The logical slice of an array located on this host's local devices.
424
+ slice: Tuple[slice, ...]
425
+ # A unique index for this host/local chunk among chunks with the same slice.
426
+ replica_id: int
427
+
428
+
429
+ class LocalChunker:
430
+ """Utility class to aid chunking of sharded arrays in multihost settings."""
431
+
432
+ def __init__(self, global_mesh: Mesh):
433
+ self.global_mesh = global_mesh
434
+ local_mesh = global_mesh.local_mesh
435
+ first_local_device = local_mesh.devices.reshape(-1)[0]
436
+ host_location = collections.OrderedDict(
437
+ zip(
438
+ global_mesh.shape.keys(),
439
+ list(zip(*np.nonzero(global_mesh.devices == first_local_device)))[0],
440
+ )
441
+ )
442
+ self.num_chunks = collections.OrderedDict()
443
+ self.chunk_ids = collections.OrderedDict()
444
+ self.mesh_axes = list(global_mesh.shape.keys())
445
+ for mesh_axis in self.mesh_axes:
446
+ num_devices_per_chunk = local_mesh.shape[mesh_axis]
447
+ self.num_chunks[mesh_axis] = global_mesh.shape[mesh_axis] // num_devices_per_chunk
448
+ self.chunk_ids[mesh_axis] = host_location[mesh_axis] // num_devices_per_chunk
449
+
450
+ def get_local_chunk_info(
451
+ self, global_shape: Tuple[int, ...], mesh_axes: Sequence[Optional[str]]
452
+ ) -> LocalChunkInfo:
453
+ """Get the local chunk info for a given array shape and sharded axes.
454
+
455
+ Args:
456
+ global_shape: the global, unsharded shape of the array to chunk.
457
+ mesh_axes: a sequence of names (or None) of equal rank to `global_shape`
458
+ that specifies which mesh dimensions the array is sharded along.
459
+
460
+ Returns:
461
+ LocalChunkInfo containing the logical slices of the array found on this
462
+ host's local devices, as well as the replica index for this chunk among
463
+ chunks with the same slice. The latter is used to determine which
464
+ host should write this chunk during checkpointing.
465
+ """
466
+ local_slice = [slice(None) for dim in global_shape]
467
+ sharded_mesh_axes = set()
468
+ for i, (mesh_axis, size) in enumerate(zip(mesh_axes, global_shape)):
469
+ if not mesh_axis:
470
+ continue
471
+ sharded_mesh_axes.add(mesh_axis)
472
+ if not isinstance(mesh_axis, str):
473
+ raise NotImplementedError("TODO(jekbradbury)")
474
+ chunk_id = self.chunk_ids[mesh_axis]
475
+ chunk_size = size // self.num_chunks[mesh_axis]
476
+ local_slice[i] = slice(chunk_id * chunk_size, (chunk_id + 1) * chunk_size)
477
+
478
+ replicated_mesh_axes = [mesh_axis for mesh_axis in self.mesh_axes if mesh_axis not in sharded_mesh_axes]
479
+ replica_id = 0
480
+ for mesh_axis in replicated_mesh_axes:
481
+ chunk_id = self.chunk_ids[mesh_axis]
482
+ replica_id = replica_id * self.num_chunks[mesh_axis] + chunk_id
483
+
484
+ return LocalChunkInfo(tuple(local_slice), replica_id)
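+ # Worked example (illustration): on a mesh with 4 data chunks, an array of global_shape (16, 1024)
+ # sharded as mesh_axes=("data", None) gives each host the slice (slice(4k, 4k + 4), slice(None)),
+ # where k is this host's chunk id along "data"; hosts sharing the same slice are told apart by
+ # replica_id over the remaining (replicated) mesh axes.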
485
+
486
+
487
+ def standard_logical_axis_rules(
488
+ activation_partitioning_dims: int = 1,
489
+ parameter_partitioning_dims: int = 1,
490
+ additional_rules: Optional[LogicalAxisRules] = None,
491
+ ) -> LogicalAxisRules:
492
+ """Default sharding rules for T5X model in terms of logical axis names.
493
+
494
+ Args:
495
+ activation_partitioning_dims: enables 2-D activation sharding when set to 2.
496
+ parameter_partitioning_dims: enables 2-D parameter sharding when set to 2.
497
+ additional_rules: additional rules (a sequence of tuples) that will be
498
+ appended to the standard rules.
499
+
500
+ Returns:
501
+ Sequence of logical axis rules
502
+ """
503
+ logging.info(
504
+ "`activation_partitioning_dims` = %d, `parameter_partitioning_dims` = %d",
505
+ activation_partitioning_dims,
506
+ parameter_partitioning_dims,
507
+ )
508
+
509
+ if activation_partitioning_dims == 1 and parameter_partitioning_dims == 1:
510
+ rules = [
511
+ ("batch", "data"),
512
+ ("vocab", "model"),
513
+ ("embed", None),
514
+ ("mlp", "model"),
515
+ ("heads", "model"),
516
+ ("kv", None),
517
+ ("joined_kv", "model"), # joined heads+kv dim in 2D attn param layouts
518
+ ]
519
+ elif activation_partitioning_dims == 2 and parameter_partitioning_dims == 1:
520
+ rules = [
521
+ ("batch", "data"),
522
+ ("vocab", "model"),
523
+ ("mlp", "model"),
524
+ ("heads", "model"),
525
+ ("kv", None),
526
+ ("joined_kv", "model"),
527
+ ("embed", "model"),
528
+ ]
529
+ elif activation_partitioning_dims == 1 and parameter_partitioning_dims == 2:
530
+ rules = [
531
+ ("batch", "data"),
532
+ ("vocab", "model"),
533
+ ("mlp", "model"),
534
+ ("heads", "model"),
535
+ ("kv", None),
536
+ ("joined_kv", "model"),
537
+ ("embed", "data"),
538
+ ]
539
+ elif activation_partitioning_dims == 2 and parameter_partitioning_dims == 2:
540
+ rules = [
541
+ ("batch", "data"),
542
+ ("vocab", "model"),
543
+ ("mlp", "model"),
544
+ ("heads", "model"),
545
+ ("kv", None),
546
+ ("joined_kv", "model"),
547
+ ("embed", "model"),
548
+ ("embed", "data"),
549
+ ]
550
+ else:
551
+ raise ValueError(
552
+ f"`activation_partitioning_dims` = {activation_partitioning_dims} "
553
+ f"`parameter_partitioning_dims` = {parameter_partitioning_dims} "
554
+ "is not supported."
555
+ )
556
+
557
+ # Add the common rules for the replicated logical axes names.
558
+ replicated_rules = [
559
+ ("relpos_buckets", None),
560
+ ("abspos_buckets", None),
561
+ ("length", None),
562
+ ("layers", None),
563
+ ("stack", None),
564
+ ("mlp_activations", None),
565
+ ]
566
+ rules.extend(replicated_rules)
567
+
568
+ if additional_rules:
569
+ rules.extend(additional_rules)
570
+
571
+ return rules
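+ # Illustrative example: under the default 1-D rules above, a parameter whose logical axes are
+ # ("embed", "vocab") (for example, an lm_head kernel) is partitioned as PartitionSpec(None, "model"),
+ # while an activation with axes ("batch", "length", "embed") maps to PartitionSpec("data", None, None).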
572
+
573
+
574
+ # NB: This needs to be top-level for the jax compilation cache.
575
+ def _id_fn(x, ix):
576
+ """Identity function for copying parameters to the devices, sharded."""
577
+ # A pure identity such as `lambda x, *: x` can get optimized away, so we
578
+ # include a random.split as a cheap function that cannot be optimized away.
579
+ y = random.split(random.PRNGKey(jnp.array(ix, dtype=jnp.uint32)))
580
+ return x, y
581
+
582
+
583
+ @dataclasses.dataclass
584
+ class DataLayout:
585
+ """Represents data layout for the partitioned model."""
586
+
587
+ batch_size: int
588
+ shard_id: int
589
+ num_shards: int
590
+ is_first_host_in_replica_set: bool
591
+
592
+
593
+ PartitionedCallable = Callable[..., Any]
594
+ CompiledPartitionedCallable = Callable[..., Any]
595
+
596
+
597
+ class BasePartitioner(metaclass=abc.ABCMeta):
598
+ """Interface for partitioning computations across hardware devices."""
599
+
600
+ def __init__(
601
+ self,
602
+ num_partitions: Optional[int] = None,
603
+ model_parallel_submesh: Optional[HardwareMesh] = None,
604
+ params_on_devices: bool = True,
605
+ backend: Optional[str] = None,
606
+ ):
607
+ """Configures the partitioner.
608
+
609
+ Args:
610
+ num_partitions: the number of partitions to use. Ignored if
611
+ `model_parallel_submesh` is provided.
612
+ model_parallel_submesh: 4-tuple that specifies the x,y,z,c submesh to use
613
+ as the model-parallel device tile. This submesh is used for the larger
614
+ of the two parameter dimensions, and, if 2-D activation sharding is
615
+ enabled, for the model dimension of activations. The rest of the mesh is
616
+ used for data parallelism and, if 2-D parameter sharding is enabled, the
617
+ other parameter dimension.
618
+ params_on_devices: whether to keep the params on devices, if False -
619
+ params stay in the host memory. Note that some partitioners might ignore
620
+ this setting, for example if they don't support storing all params on
621
+ device memory.
622
+ backend: get devices from the pinned backend, if specified. This is useful
623
+ for explicitly specifying the devices rather than relying on
624
+ jax_platform_name.
625
+ """
626
+
627
+ if not num_partitions and not model_parallel_submesh:
628
+ raise ValueError("At least one of `num_partitions` or " "`model_parallel_submesh` must be set.")
629
+
630
+ if model_parallel_submesh is not None and len(model_parallel_submesh) != 4:
631
+ logging.error(
632
+ (
633
+ "`model_parallel_submesh` must be either None or a 4-tuple. Got"
634
+ " `model_parallel_submesh`=%s. A ValueError will be raised"
635
+ " beginning March 1, 2022."
636
+ ),
637
+ model_parallel_submesh,
638
+ )
639
+
640
+ if bool(num_partitions) and bool(model_parallel_submesh):
641
+ logging.error(
642
+ (
643
+ "At most one of `num_partitions` or `model_parallel_submesh` can be"
644
+ " set. Got `num_partitions=%s` and `model_parallel_submesh`=%s. A"
645
+ " ValueError will be raised beginning March 21, 2022."
646
+ ),
647
+ num_partitions,
648
+ model_parallel_submesh,
649
+ )
650
+
651
+ self._num_partitions = num_partitions
652
+ self._model_parallel_submesh = model_parallel_submesh
653
+ self._params_on_devices = params_on_devices
654
+ self._data_axis = "data"
655
+ self._backend = backend
656
+
657
+ @property
658
+ def mesh(self) -> Mesh:
659
+ raise NotImplementedError
660
+
661
+ @property
662
+ def data_partition_spec(self) -> PartitionSpec:
663
+ return PartitionSpec(self._data_axis)
664
+
665
+ def get_data_layout(self, batch_size: Optional[int] = None, host_index: Optional[int] = None) -> DataLayout:
666
+ """Returns filled `DataLayout` based on the partitioned model layout.
667
+
668
+ Args:
669
+ batch_size: if set, indicates the requested batch size. An exception will
670
+ be raised if this batch size is not compatible with the layout. If not
671
+ set, the batch size is inferred from the layout.
672
+ host_index: indicates the host index to use for the calculations, if not
673
+ set, the JAX-provided one is used. Should be in the [0, num_hosts) interval and the
674
+ order should match the order of corresponding CPU devices in
675
+ `jax.devices()`.
676
+
677
+ Returns:
678
+ Filled `DataLayout` structure.
679
+ """
680
+ if host_index is not None:
681
+ raise NotImplementedError("Explicit host_index is not yet implemented.")
682
+ if self._data_axis is None:
683
+ return DataLayout(
684
+ batch_size=batch_size,
685
+ shard_id=0,
686
+ num_shards=1,
687
+ is_first_host_in_replica_set=(jax.process_index() == 0),
688
+ )
689
+ mesh_size = self._local_chunker.global_mesh.shape[self._data_axis]
690
+ batch_size = batch_size or mesh_size
691
+ if batch_size % mesh_size:
692
+ raise ValueError(
693
+ f"Batch size ({batch_size}) must be divisible by corresponding " f"mesh size ({mesh_size})."
694
+ )
695
+ num_shards = self._local_chunker.num_chunks[self._data_axis]
696
+ if batch_size % num_shards:
697
+ raise ValueError(f"Batch size ({batch_size}) must be divisible by number of " f"replicas ({num_shards}).")
698
+ replica_id = self._local_chunker.get_local_chunk_info((batch_size,), [self._data_axis]).replica_id
699
+ return DataLayout(
700
+ batch_size=int(batch_size),
701
+ shard_id=int(self._local_chunker.chunk_ids[self._data_axis]),
702
+ num_shards=int(num_shards),
703
+ is_first_host_in_replica_set=(replica_id == 0),
704
+ )
705
+
706
+ def get_local_chunk_info(
707
+ self, global_shape: Tuple[int, ...], mesh_axes: Sequence[Optional[str]]
708
+ ) -> LocalChunkInfo:
709
+ """Returns the local chunk info for a given array shape and sharded axes."""
710
+ return self._local_chunker.get_local_chunk_info(global_shape, mesh_axes)
711
+
712
+ @property
713
+ def params_on_devices(self):
714
+ return self._params_on_devices
715
+
716
+ def move_params_to_devices(self, train_state: TrainState, train_state_axes: TrainState) -> TrainState:
717
+ """Moves the optimizer parameters to devices."""
718
+ p_id_fn = self.partition(
719
+ _id_fn,
720
+ in_axis_resources=(train_state_axes, None),
721
+ out_axis_resources=(train_state_axes, None),
722
+ donate_argnums=(0,),
723
+ )
724
+ if jax.config.jax_array and jax.process_count() > 1:
725
+ train_state = multihost_utils.host_local_array_to_global_array(train_state, self.mesh, train_state_axes)
726
+ train_state, _ = p_id_fn(train_state, jnp.ones((), dtype=jnp.uint32))
727
+ return train_state
728
+
729
+ @property
730
+ @abc.abstractmethod
731
+ def _local_chunker(self):
732
+ """Returns the chunker that matches the parameters of this partitioner."""
733
+ raise NotImplementedError
734
+
735
+ def get_logical_axes(self, train_state: TrainState) -> TrainState:
736
+ """Returns a copy of TrainState with Optional[AxisNames] as leaves."""
737
+ # By default, return None for the logical axes.
738
+ return train_state.restore_state(jax.tree_map(lambda x: None, train_state.state_dict()))
739
+
740
+ def get_mesh_axes(self, train_state: TrainState) -> TrainState:
741
+ """Returns a copy of TrainState with Optional[PartitionSpecs] as leaves."""
742
+ raise NotImplementedError
743
+
744
+ @abc.abstractmethod
745
+ def partition(
746
+ self,
747
+ fn: Callable, # pylint: disable=g-bare-generic
748
+ in_axis_resources,
749
+ out_axis_resources,
750
+ static_argnums: Union[int, Sequence[int]] = (),
751
+ donate_argnums: Union[int, Sequence[int]] = (),
752
+ ) -> PartitionedCallable:
753
+ """Partitions the computation using partitioner-specific implementation.
754
+
755
+ Args:
756
+ fn: the function to partition.
757
+ in_axis_resources: Pytree of structure matching that of arguments to `fn`,
758
+ with all actual arguments replaced by resource assignment
759
+ specifications. It is also valid to specify a pytree prefix (e.g. one
760
+ value in place of a whole subtree), in which case the leaves get
761
+ broadcast to all values in that subtree.
762
+ The valid resource assignment specifications are:
763
+ `None`: in which case the value will be replicated on all devices
764
+ `PartitionSpec`: a tuple of length at most equal to the rank of the
765
+ partitioned value. Each element can be a `None`, a mesh axis or a
766
+ tuple of mesh axes, and specifies the set of resources assigned to
767
+ partition the value's dimension matching its position in the spec.
768
+ out_axis_resources: Like `in_axis_resources`, but specifies resource
769
+ assignment for function outputs.
770
+ static_argnums: an optional int or collection of ints that specify which
771
+ positional arguments to treat as static (compile-time constant) in the
772
+ partitioned function.
773
+ donate_argnums: an optional int or collection of ints that specify which
774
+ argument buffers are "donated" to the computation. It is safe to donate
775
+ argument buffers if you no longer need them once the computation has
776
+ finished.
777
+
778
+ Returns:
779
+ A partitioned version of the input function.
780
+ """
781
+ raise NotImplementedError
782
+
783
+ @abc.abstractmethod
784
+ def compile(self, partitioned_fn: PartitionedCallable, *args) -> CompiledPartitionedCallable:
785
+ """Compiles and returns the partitioned function, or the original.
786
+
787
+ Args:
788
+ partitioned_fn: The partitioned function.
789
+ *args: Sample arguments to the partitioned function matching the input
790
+ shapes that will be passed to the compiled function.
791
+
792
+ Returns:
793
+ The compiled function, or the original if this partitioner does not
794
+ support compilation.
795
+ """
796
+ raise NotImplementedError
797
+
798
+
799
+ class PjittedFnWithContext(PartitionedCallable):
800
+ """Wraps pjitted function to apply the appropriate contexts."""
801
+
802
+ def __init__(
803
+ self,
804
+ pjitted_fn,
805
+ partition_mesh: Mesh,
806
+ logical_axis_rules: flax_partitioning.LogicalRules = (),
807
+ ):
808
+ self._pjitted_fn = pjitted_fn
809
+ self._mesh = partition_mesh
810
+ self._logical_axis_rules = logical_axis_rules
811
+
812
+ def __call__(self, *args):
813
+ with Mesh(self._mesh.devices, self._mesh.axis_names), flax_partitioning.axis_rules(self._logical_axis_rules):
814
+ return self._pjitted_fn(*args)
815
+
816
+ def lower(self, *args):
817
+ with Mesh(self._mesh.devices, self._mesh.axis_names), flax_partitioning.axis_rules(self._logical_axis_rules):
818
+ return self._pjitted_fn.lower(*args)
819
+
820
+
821
+ class BasePjitPartitioner(BasePartitioner):
822
+ """Partitioner that uses T5X version of jax.pjit."""
823
+
824
+ @cached_property
825
+ def _local_chunker(self) -> LocalChunker:
826
+ return LocalChunker(self.mesh)
827
+
828
+ @cached_property
829
+ def mesh(self) -> Mesh:
830
+ return default_mesh(self._num_partitions, self._model_parallel_submesh, self._backend)
831
+
832
+ def partition(
833
+ self,
834
+ fn: Callable, # pylint: disable=g-bare-generic
835
+ in_axis_resources,
836
+ out_axis_resources,
837
+ static_argnums: Union[int, Sequence[int]] = (),
838
+ donate_argnums: Union[int, Sequence[int]] = (),
839
+ ) -> PjittedFnWithContext:
840
+ pjitted = pjit(
841
+ fn,
842
+ in_axis_resources=in_axis_resources,
843
+ out_axis_resources=out_axis_resources,
844
+ static_argnums=static_argnums,
845
+ donate_argnums=donate_argnums,
846
+ backend=self._backend,
847
+ )
848
+
849
+ return PjittedFnWithContext(pjitted, self.mesh)
850
+
851
+ def compile(self, partitioned_fn: PjittedFnWithContext, *args) -> CompiledPartitionedCallable:
852
+ return partitioned_fn.lower(*args).compile()
853
+
854
+
855
+ class PjitPartitioner(BasePjitPartitioner):
856
+ """Partitioner that uses named axes and jax.pjit."""
857
+
858
+ def __init__(
859
+ self,
860
+ num_partitions: Optional[int] = None,
861
+ model_parallel_submesh: Optional[HardwareMesh] = None,
862
+ params_on_devices: bool = True,
863
+ backend: Optional[str] = None,
864
+ logical_axis_rules: Optional[LogicalAxisRules] = None,
865
+ use_cpu_pjit: Optional[bool] = False,
866
+ ):
867
+ """PjitPartitioner constructor.
868
+
869
+ See https://github.com/google-research/text-to-text-transfer-transformer/blob/main/README.mdx/usage/partitioning for details.
870
+
871
+ Args:
872
+ num_partitions: an integer that specifies the size of the model parallel
873
+ submesh to be automatically selected for the current topology. See
874
+ `model_parallel_submesh` for details on how this submesh is used.
875
+ Mutually exclusive with `model_parallel_submesh`.
876
+ model_parallel_submesh: is a 4-tuple that specifies the `(x, y, z, c)`
877
+ submesh model-parallel device tile, an axis of accelerator parallelism
878
+ orthogonal to data parallelism. Array axes in a model's parameters or
879
+ activations can be sharded over this submesh using axis rules (see
880
+ `logical_axis_rules`) that map them to 'model'. The effective number of
881
+ model sub-partitions is equal to `np.prod(model_parallel_submesh)` and
882
+ must evenly divide the total number of devices (i.e.,
883
+ `jax.device_count() % np.prod(model_parallel_submesh) == 0`). The rest
884
+ of the TPU mesh is the data parallel submesh, providing
885
+ `jax.device_count() // np.prod(model_parallel_submesh)` partitions. It
886
+ is used for data (batch) parallelism and to shard other array axes that
887
+ are mapped to 'data'. This argument is mutually exclusive with
888
+ `num_partitions`.
889
+ params_on_devices: whether to keep the params on devices; if False,
890
+ params stay in the host memory. Note that some partitioners might ignore
891
+ this setting, for example if they don't support storing all params on
892
+ device memory.
893
+ backend: get devices from the pinned backend, if specified. This is
894
+ useful for explicitly specifying the devices rather than relying on
895
+ jax_platform_name.
896
+ logical_axis_rules: a priority-ordered sequence of KV tuples that maps
897
+ logical axis names to either `None` (not sharded), 'model' (to shard
898
+ across the model-parallel submesh), or 'data' (to shard across the
899
+ data-parallel submesh).
900
+ use_cpu_pjit: enables wrapper function for pjit which just jits the
901
+ function if using CPU backend.
902
+ """
903
+ super().__init__(
904
+ num_partitions=num_partitions,
905
+ model_parallel_submesh=model_parallel_submesh,
906
+ params_on_devices=params_on_devices,
907
+ backend=backend,
908
+ )
909
+ if logical_axis_rules is None:
910
+ logical_axis_rules = standard_logical_axis_rules()
911
+ self._logical_axis_rules = tuple(logical_axis_rules)
912
+ (self._data_axis,) = flax_partitioning.logical_to_mesh_axes(["batch"], logical_axis_rules)
913
+ self._use_cpu_pjit = use_cpu_pjit
914
+
915
+ def partition(
916
+ self,
917
+ fn: Callable, # pylint: disable=g-bare-generic
918
+ in_axis_resources,
919
+ out_axis_resources,
920
+ static_argnums: Union[int, Sequence[int]] = (),
921
+ donate_argnums: Union[int, Sequence[int]] = (),
922
+ ) -> PjittedFnWithContext:
923
+ """Partitions the function using jax.pjit."""
924
+ if self._use_cpu_pjit:
925
+ pjit_fn = pjit_with_cpu_fallback
926
+ else:
927
+ pjit_fn = pjit
928
+ pjitted = pjit_fn(
929
+ fn,
930
+ in_axis_resources=in_axis_resources,
931
+ out_axis_resources=out_axis_resources,
932
+ static_argnums=static_argnums,
933
+ donate_argnums=donate_argnums,
934
+ backend=self._backend,
935
+ )
936
+
937
+ return PjittedFnWithContext(pjitted, self.mesh, self._logical_axis_rules)
938
+
939
+ @property
940
+ def logical_axis_rules(self):
941
+ """Returns the logical axis rules."""
942
+ return self._logical_axis_rules
943
+
944
+ def get_logical_axes(self, train_state: TrainState) -> TrainState:
945
+ """Returns a copy of TrainState with Optional[AxisNames] as leaves."""
946
+ return train_state.as_logical_axes()
947
+
948
+ def get_mesh_axes(self, train_state: TrainState) -> TrainState:
949
+ """Returns a copy of TrainState with Optional[PartitionSpecs] as leaves."""
950
+ logical_axes = self.get_logical_axes(train_state)
951
+
952
+ def _logical_to_mesh_axes(param_name, logical_axes):
953
+ if logical_axes is None:
954
+ return None
955
+ elif logical_axes is traverse_util.empty_node:
956
+ return traverse_util.empty_node
957
+ try:
958
+ return flax_partitioning.logical_to_mesh_axes(logical_axes, self._logical_axis_rules)
959
+ except ValueError as e:
960
+ raise ValueError(f"Failed to map logical axes for {param_name}") from e
961
+
962
+ flat_logical_axes = traverse_util.flatten_dict(logical_axes.state_dict(), keep_empty_nodes=True, sep="/")
963
+ flat_mesh_axes = {k: _logical_to_mesh_axes(k, v) for k, v in flat_logical_axes.items()}
964
+
965
+ return logical_axes.restore_state(traverse_util.unflatten_dict(flat_mesh_axes, sep="/"))
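To make the partitioner interface above concrete, here is a minimal usage sketch (hedged: the single-partition mesh, the `jax.sharding.PartitionSpec` import path, and the toy function are assumptions, not part of this file):

    import jax
    import jax.numpy as jnp
    from jax.sharding import PartitionSpec

    # Build a data-parallel-only partitioner (model-parallel submesh of size 1).
    partitioner = PjitPartitioner(num_partitions=1)

    # pjit a toy function: shard the batch axis of the input over the 'data' mesh axis.
    p_sum = partitioner.partition(
        lambda x: jnp.sum(x, axis=-1),
        in_axis_resources=PartitionSpec("data", None),
        out_axis_resources=PartitionSpec("data"),
    )

    batch = jnp.ones((jax.device_count(), 8))
    print(p_sum(batch).shape)  # -> (jax.device_count(),)
    print(partitioner.get_data_layout(batch_size=jax.device_count()))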
distil_whisper/pipeline.py ADDED
@@ -0,0 +1,527 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Whisper JAX pipeline compatible with Distil Whisper checkpoints. Copied from https://github.com/sanchit-gandhi/whisper-jax/blob/main/whisper_jax/pipeline.py"""
17
+
18
+ import math
19
+
20
+ import jax
21
+ import jax.numpy as jnp
22
+ import numpy as np
23
+ import requests
24
+ import torch
25
+ from flax import jax_utils
26
+ from flax.core.frozen_dict import freeze
27
+ from flax.training.common_utils import shard
28
+ from transformers import WhisperFeatureExtractor, WhisperTokenizerFast
29
+ from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
30
+ from transformers.pipelines.audio_utils import ffmpeg_read
31
+ from transformers.utils import logging
32
+
33
+ from .modeling_flax_whisper import FlaxWhisperForConditionalGeneration
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ class FlaxWhisperFeatureExtractor(WhisperFeatureExtractor):
40
+ def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
41
+ """
42
+ Compute the log-mel spectrogram of the provided audio using torch filters. The torch implementation
43
+ computes the STFT filter banks approx. 5x faster than its numpy counterpart, which is the native implementation
44
+ in transformers, and matches it to within 1e-5 absolute tolerance.
45
+ """
46
+ waveform = torch.from_numpy(waveform).type(torch.float32)
47
+
48
+ window = torch.hann_window(self.n_fft)
49
+ stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
50
+ magnitudes = stft[..., :-1].abs() ** 2
51
+
52
+ mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
53
+ mel_spec = mel_filters.T @ magnitudes
54
+
55
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
56
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
57
+ log_spec = (log_spec + 4.0) / 4.0
58
+ return log_spec.numpy()
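A small smoke test of the torch-backed feature extractor above (a sketch only; the checkpoint name and the expected `(1, 80, 3000)` output shape are assumptions for a standard 80-mel Whisper model):

    import numpy as np

    feature_extractor = FlaxWhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
    audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
    features = feature_extractor(audio, sampling_rate=16000, return_tensors="np")
    print(features["input_features"].shape)  # e.g. (1, 80, 3000)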
59
+
60
+
61
+ class FlaxWhisperPipeline:
62
+ def __init__(
63
+ self,
64
+ checkpoint="openai/whisper-large-v2",
65
+ dtype=jnp.float32,
66
+ batch_size=None,
67
+ max_length=None,
68
+ **kwargs,
69
+ ):
70
+ """
71
+ Args:
72
+ checkpoint (`str`, *optional*, defaults to `"openai/whisper-large-v2"`):
73
+ The Whisper checkpoint to use with the pipeline. Must be an available checkpoint on the Hugging Face Hub
74
+ with Flax weights.
75
+ dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
76
+ The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
77
+ `jax.numpy.bfloat16` (on TPUs). This can be used to enable half-precision inference on GPUs or TPUs.
78
+ If specified all the computation will be performed with the given `dtype`. **Note that this only
79
+ specifies the dtype of the computation and does not influence the dtype of model parameters.**
80
+ batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
81
+ The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. Passing
82
+ a batch size in the `__init__` method will be superseded by any batch size passed to the `__call__` method.
83
+ max_length (`int`, *optional*):
84
+ The maximum numbers of tokens to generate. Defaults to `model.config.max_length`.
85
+ """
86
+ self.checkpoint = checkpoint
87
+ self.dtype = dtype
88
+
89
+ self.feature_extractor = FlaxWhisperFeatureExtractor.from_pretrained(self.checkpoint)
90
+ self.tokenizer = WhisperTokenizerFast.from_pretrained(self.checkpoint)
91
+
92
+ self.model, self.params = FlaxWhisperForConditionalGeneration.from_pretrained(
93
+ self.checkpoint,
94
+ _do_init=False,
95
+ dtype=self.dtype,
96
+ **kwargs,
97
+ )
98
+
99
+ self.max_length = max_length if max_length is not None else self.model.generation_config.max_length
100
+ self.min_batch_size = jax.local_device_count()
101
+ self.batch_size = (
102
+ batch_size if batch_size is not None else self.min_batch_size
103
+ ) # we need a minimum of 1 batch per-device
104
+
105
+ def generate(
106
+ params,
107
+ input_features,
108
+ forced_decoder_ids,
109
+ return_timestamps,
110
+ num_beams,
111
+ length_penalty,
112
+ do_sample,
113
+ top_k,
114
+ temperature,
115
+ ):
116
+ output_ids = self.model.pipeline_generate(
117
+ input_features,
118
+ params=params,
119
+ forced_decoder_ids=forced_decoder_ids,
120
+ return_timestamps=return_timestamps,
121
+ max_length=self.max_length,
122
+ num_beams=num_beams,
123
+ length_penalty=length_penalty,
124
+ do_sample=do_sample,
125
+ top_k=top_k,
126
+ temperature=temperature,
127
+ )
128
+ return output_ids
129
+
130
+ self.params = jax_utils.replicate(self.params)
131
+ self.p_generate = jax.pmap(
132
+ generate,
133
+ "input_features",
134
+ in_axes=(0, 0, None, None, None, None, None, None, None),
135
+ static_broadcasted_argnums=(
136
+ 3,
137
+ 4,
138
+ 5,
139
+ 6,
140
+ 7,
141
+ 8,
142
+ ),
143
+ )
144
+
145
+ def generate(
146
+ self,
147
+ input_features,
148
+ language=None,
149
+ task=None,
150
+ return_timestamps=False,
151
+ num_beams=1,
152
+ length_penalty=1.0,
153
+ do_sample=False,
154
+ top_k=50,
155
+ temperature=1.0,
156
+ ):
157
+ forced_decoder_ids = self.get_forced_decoder_ids(
158
+ language=language, task=task, return_timestamps=return_timestamps
159
+ )
160
+ # if we're using pmap we need to manually replicate the input data across devices and gather the output tokens
161
+ output_ids = self.p_generate(
162
+ freeze(self.params),
163
+ shard(input_features),
164
+ forced_decoder_ids,
165
+ return_timestamps,
166
+ num_beams,
167
+ length_penalty,
168
+ do_sample,
169
+ top_k,
170
+ temperature,
171
+ ).sequences
172
+ output_ids = jax.device_get(output_ids.reshape(-1, self.max_length))
173
+ return output_ids
174
+
175
+ def get_forced_decoder_ids(self, generation_config=None, task=None, language=None, return_timestamps=False):
176
+ if generation_config is None:
177
+ generation_config = self.model.generation_config
178
+
179
+ if hasattr(generation_config, "is_multilingual"):
180
+ is_multilingual = generation_config.is_multilingual
181
+ else:
182
+ is_multilingual = None
183
+
184
+ forced_decoder_ids = []
185
+
186
+ if is_multilingual:
187
+ if language is not None:
188
+ language = language.lower()
189
+ if language in generation_config.lang_to_id.keys():
190
+ language_token = language
191
+ elif language in TO_LANGUAGE_CODE.values():
192
+ language_token = f"<|{language}|>"
193
+ elif language in TO_LANGUAGE_CODE.keys():
194
+ language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
195
+ else:
196
+ if len(language) == 2:
197
+ # ISO 639-1 language code
198
+ acceptable_languages = list(TO_LANGUAGE_CODE.values())
199
+ elif "<" in language or "|" in language or ">" in language:
200
+ # generation config language code
201
+ acceptable_languages = list(generation_config.lang_to_id.keys())
202
+ else:
203
+ # language passed as a string
204
+ acceptable_languages = list(TO_LANGUAGE_CODE.keys())
205
+ raise ValueError(
206
+ f"Unsupported language: {language}. Language should be one of:" f" {acceptable_languages}."
207
+ )
208
+ forced_decoder_ids.append((1, generation_config.lang_to_id[language_token]))
209
+
210
+ if task is not None:
211
+ forced_decoder_ids.append((2, generation_config.task_to_id[task]))
212
+ else:
213
+ forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"]))
214
+
215
+ if not return_timestamps:
216
+ if forced_decoder_ids and forced_decoder_ids[-1][0] != generation_config.no_timestamps_token_id:
217
+ idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1
218
+ forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id))
219
+ else:
220
+ forced_decoder_ids.append((1, generation_config.no_timestamps_token_id))
221
+
222
+ return forced_decoder_ids
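For example, with a multilingual generation config like the one committed later in this repo, the method yields one `(position, token_id)` pair per prompt slot (hedged sketch; `pipeline` is assumed to be a `FlaxWhisperPipeline` instance and the token ids come from that config):

    # Norwegian transcription without timestamps:
    #   position 1 -> "<|no|>" (50288), position 2 -> "transcribe" (50360),
    #   position 3 -> "<|notimestamps|>" (50364)
    ids = pipeline.get_forced_decoder_ids(language="no", task="transcribe", return_timestamps=False)
    print(ids)  # -> [(1, 50288), (2, 50360), (3, 50364)]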
223
+
224
+ def chunk_iter_with_batch(self, inputs, chunk_len, stride_left, stride_right, batch_size):
225
+ inputs_len = inputs.shape[0]
226
+ step = chunk_len - stride_left - stride_right
227
+
228
+ all_chunk_start_idx = np.arange(0, inputs_len, step)
229
+ num_samples = len(all_chunk_start_idx)
230
+
231
+ num_batches = math.ceil(num_samples / batch_size)
232
+ batch_idx = np.array_split(np.arange(num_samples), num_batches)
233
+
234
+ for idx in batch_idx:
235
+ chunk_start_idx = all_chunk_start_idx[idx]
236
+
237
+ chunk_end_idx = chunk_start_idx + chunk_len
238
+
239
+ chunks = [inputs[chunk_start:chunk_end] for chunk_start, chunk_end in zip(chunk_start_idx, chunk_end_idx)]
240
+ processed = self.feature_extractor(
241
+ chunks, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
242
+ )
243
+
244
+ _stride_left = np.where(chunk_start_idx == 0, 0, stride_left)
245
+ is_last = np.where(stride_right > 0, chunk_end_idx > inputs_len, chunk_end_idx >= inputs_len)
246
+ _stride_right = np.where(is_last, 0, stride_right)
247
+
248
+ chunk_lens = [chunk.shape[0] for chunk in chunks]
249
+ strides = [
250
+ (chunk_l, _stride_l, _stride_r)
251
+ for chunk_l, _stride_l, _stride_r in zip(chunk_lens, _stride_left, _stride_right)
252
+ ]
253
+
254
+ yield {"stride": strides, **processed}
255
+
256
+ def preprocess_batch(self, inputs, chunk_length_s=30.0, stride_length_s=None, batch_size=None):
257
+ if isinstance(inputs, np.ndarray):
258
+ logger.warning(
259
+ "Numpy array passed as input - no sampling rate checks will be performed."
260
+ "It is strongly recommended to pass the input as a dictionary with an 'array' key "
261
+ "containing the numpy array representing the audio, and a 'sampling_rate' key "
262
+ "containing the sampling rate associated with the audio array."
263
+ "Failing to do so can result in silent errors that might be hard to debug."
264
+ )
265
+
266
+ if isinstance(inputs, str):
267
+ if inputs.startswith("http://") or inputs.startswith("https://"):
268
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
269
+ # like http_huggingface_co.png
270
+ inputs = requests.get(inputs).content
271
+ else:
272
+ with open(inputs, "rb") as f:
273
+ inputs = f.read()
274
+
275
+ if isinstance(inputs, bytes):
276
+ inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
277
+
278
+ stride = None
279
+ if isinstance(inputs, dict):
280
+ stride = inputs.get("stride", None)
281
+ # Accepting `"array"` which is the key defined in `datasets` for
282
+ # better integration
283
+ if not ("sampling_rate" in inputs and "array" in inputs):
284
+ raise ValueError(
285
+ "When passing a dictionary to FlaxWhisperPipline, the dict needs to contain an 'array' key "
286
+ "containing the numpy array representing the audio, and a 'sampling_rate' key "
287
+ "containing the sampling rate associated with the audio array."
288
+ )
289
+
290
+ in_sampling_rate = inputs.get("sampling_rate")
291
+ inputs = inputs.get("array", None)
292
+
293
+ if in_sampling_rate != self.feature_extractor.sampling_rate:
294
+ try:
295
+ import librosa
296
+ except ImportError as err:
297
+ raise ImportError(
298
+ "To support resampling audio files, please install 'librosa' and 'soundfile'."
299
+ ) from err
300
+
301
+ inputs = librosa.resample(
302
+ inputs, orig_sr=in_sampling_rate, target_sr=self.feature_extractor.sampling_rate
303
+ )
304
+ ratio = self.feature_extractor.sampling_rate / in_sampling_rate
305
+ else:
306
+ ratio = 1
307
+
308
+ if not isinstance(inputs, np.ndarray):
309
+ raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
310
+ if len(inputs.shape) != 1:
311
+ raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")
312
+
313
+ if stride is not None:
314
+ if stride[0] + stride[1] > inputs.shape[0]:
315
+ raise ValueError("Stride is too large for input")
316
+
317
+ # Stride needs to get the chunk length here, it's going to get
318
+ # swallowed by the `feature_extractor` later, and then batching
319
+ # can add extra data in the inputs, so we need to keep track
320
+ # of the original length in the stride so we can cut properly.
321
+ stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
322
+
323
+ if chunk_length_s:
324
+ if stride_length_s is None:
325
+ stride_length_s = chunk_length_s / 6
326
+
327
+ if isinstance(stride_length_s, (int, float)):
328
+ stride_length_s = [stride_length_s, stride_length_s]
329
+
330
+ chunk_len = round(chunk_length_s * self.feature_extractor.sampling_rate)
331
+ stride_left = round(stride_length_s[0] * self.feature_extractor.sampling_rate)
332
+ stride_right = round(stride_length_s[1] * self.feature_extractor.sampling_rate)
333
+
334
+ if chunk_len < stride_left + stride_right:
335
+ raise ValueError("Chunk length must be superior to stride length")
336
+
337
+ for item in self.chunk_iter_with_batch(
338
+ inputs,
339
+ chunk_len,
340
+ stride_left,
341
+ stride_right,
342
+ batch_size,
343
+ ):
344
+ yield item
345
+ else:
346
+ processed = self.feature_extractor(
347
+ inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="np"
348
+ )
349
+ if stride is not None:
350
+ processed["stride"] = stride
351
+ yield processed
352
+
353
+ def postprocess(self, model_outputs, return_timestamps=None, return_language=None):
354
+ # unpack the outputs from list(dict(list)) to list(dict)
355
+ model_outputs = [dict(zip(output, t)) for output in model_outputs for t in zip(*output.values())]
356
+
357
+ time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
358
+ # Send the chunking back to seconds, it's easier to handle in whisper
359
+ sampling_rate = self.feature_extractor.sampling_rate
360
+ for output in model_outputs:
361
+ if "stride" in output:
362
+ chunk_len, stride_left, stride_right = output["stride"]
363
+ # Go back in seconds
364
+ chunk_len /= sampling_rate
365
+ stride_left /= sampling_rate
366
+ stride_right /= sampling_rate
367
+ output["stride"] = chunk_len, stride_left, stride_right
368
+
369
+ text, optional = self.tokenizer._decode_asr(
370
+ model_outputs,
371
+ return_timestamps=return_timestamps,
372
+ return_language=return_language,
373
+ time_precision=time_precision,
374
+ )
375
+ return {"text": text, **optional}
376
+
377
+ def forward(
378
+ self,
379
+ model_inputs,
380
+ batch_size=None,
381
+ language=None,
382
+ task=None,
383
+ return_timestamps=False,
384
+ num_beams=1,
385
+ length_penalty=1.0,
386
+ do_sample=False,
387
+ top_k=50,
388
+ temperature=1.0,
389
+ ):
390
+ # We need to keep track of some additional input arguments for post-processing, so we forward these on after running generation
391
+ input_features = model_inputs.pop("input_features")
392
+ input_batch_size = input_features.shape[0]
393
+
394
+ if input_batch_size != batch_size:
395
+ padding = np.zeros([batch_size - input_batch_size, *input_features.shape[1:]], input_features.dtype)
396
+ input_features = np.concatenate([input_features, padding])
397
+
398
+ pred_ids = self.generate(
399
+ input_features,
400
+ language=language,
401
+ task=task,
402
+ return_timestamps=return_timestamps,
403
+ num_beams=num_beams,
404
+ length_penalty=length_penalty,
405
+ do_sample=do_sample,
406
+ top_k=top_k,
407
+ temperature=temperature,
408
+ )[:input_batch_size]
409
+
410
+ # tokenizer's decode method expects an extra dim - we insert it here for convenience
411
+ out = {"tokens": pred_ids[:, None, :]}
412
+
413
+ stride = model_inputs.pop("stride", None)
414
+ if stride is not None:
415
+ out["stride"] = stride
416
+
417
+ return out
418
+
419
+ def __call__(
420
+ self,
421
+ inputs,
422
+ chunk_length_s=30.0,
423
+ stride_length_s=None,
424
+ batch_size=None,
425
+ language=None,
426
+ task=None,
427
+ return_timestamps=None,
428
+ num_beams=1,
429
+ length_penalty=1.0,
430
+ do_sample=False,
431
+ top_k=50,
432
+ temperature=1.0,
433
+ ):
434
+ """
435
+ Transcribe an audio input sequence to a text transcription, optionally with timestamps.
436
+
437
+ Args:
438
+ inputs (`np.ndarray` or `bytes` or `str` or `dict`):
439
+ The inputs is either:
440
+ - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
441
+ to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
442
+ - `bytes` is the byte content of an audio file and is interpreted by *ffmpeg* in the
443
+ same way.
444
+ - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
445
+ Raw audio assumed to be at the correct sampling rate (16kHz). Note that no further sampling
446
+ rate check will be done.
447
+ - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
448
+ pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "array":
449
+ np.array}`. Optionally an additional argument `"stride": (left: int, right: int)` can be used to
450
+ ask the pipeline to treat the first `left` samples and last `right` samples to be ignored in
451
+ decoding (but used at inference to provide more context to the model). In general, this additional
452
+ stride argument is not required.
453
+ chunk_length_s (`float`, *optional*, defaults to 30.0):
454
+ The input length for each chunk. If `chunk_length_s = 0` then chunking is disabled. By default, the chunk
455
+ length is set to 30.0s, equal to Whisper's context window.
456
+ stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
457
+ The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This enables
458
+ the model to *see* more context and infer letters better than without this context but the pipeline
459
+ discards the stride bits at the end to make the final reconstitution as perfect as possible.
460
+
461
+ <Tip>
462
+
463
+ For more information on how to effectively use `stride_length_s`, refer to the [ASR chunking
464
+ blog post](https://huggingface.co/blog/asr-chunking).
465
+
466
+ </Tip>
467
+ batch_size (`int`, *optional*, defaults to the minimum per-device batch size, i.e. `jax.local_device_count()`):
468
+ The batch size to be used in chunking transcription. Beneficial for transcribing long audio files. Passing
469
+ a batch size in the `__call__` method will supersede any batch size passed to the `__init__`.
470
+ task (`str`, *optional*):
471
+ Task to use for generation, either `"transcribe"` or `"translate"`. Defaults to `"transcribe"`.
472
+ language (`str`, *optional*):
473
+ Language token to use for generation, can be either in the form of `"<|en|>"`, `"en"` or `"english"`.
474
+ Defaults to `None`, meaning the language is automatically inferred from the audio input.
475
+ return_timestamps (*optional*, `bool`):
476
+ Whether to return timestamps in the prediction. Defaults to False. If set to true, the pipeline
477
+ will return two keys in the output dictionary: `"text"` containing the text transcription, and `"chunks"`
478
+ containing the transcription segments chunked by their utterance-level timestamps.
479
+ length_penalty (*optional*, `float`):
480
+ Exponential penalty to the length that is used with beam-based generation. It is applied as an
481
+ exponent to the sequence length, which in turn is used to divide the score of the sequence. Since
482
+ the score is the log likelihood of the sequence (i.e. negative), length_penalty > 1.0 promotes
483
+ longer sequences, while length_penalty < 1.0 encourages shorter sequences.
484
+ do_sample (*optional*, `bool`):
485
+ Whether or not to use sampling; use greedy decoding otherwise.
486
+ top_k (*optional*, `int`):
487
+ The number of the highest probability vocabulary tokens to keep for top-k-filtering.
488
+ temperature (*optional*, `float`):
489
+ The value used to modulate the next token probabilities if sampling.
490
+
491
+ Return:
492
+ `Dict`: A dictionary with the following keys:
493
+ - **text** (`str`) -- The recognised text.
494
+ - **chunks** (*optional*, `List[Dict]`)
495
+ When using `return_timestamps`, the `chunks` will become a list containing all the various text
496
+ chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamps": (0.5, 0.9)}, {"text":
497
+ "there", "timestamps": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
498
+ `"".join(chunk["text"] for chunk in output["chunks"])`.
499
+ """
500
+ batch_size = batch_size if batch_size is not None else self.batch_size
501
+ if batch_size % self.min_batch_size != 0:
502
+ raise ValueError(
503
+ f"Batch size must be a multiple of the number of JAX devices, but got batch size {batch_size} and num devices {self.min_batch_size}."
504
+ )
505
+
506
+ dataloader = self.preprocess_batch(
507
+ inputs, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, batch_size=batch_size
508
+ )
509
+ model_outputs = []
510
+ # iterate over our chunked audio samples
511
+ for batch in dataloader:
512
+ model_outputs.append(
513
+ self.forward(
514
+ batch,
515
+ batch_size=batch_size,
516
+ language=language,
517
+ task=task,
518
+ return_timestamps=return_timestamps,
519
+ num_beams=num_beams,
520
+ length_penalty=length_penalty,
521
+ do_sample=do_sample,
522
+ top_k=top_k,
523
+ temperature=temperature,
524
+ )
525
+ )
526
+ post_processed = self.postprocess(model_outputs, return_timestamps=return_timestamps)
527
+ return post_processed
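End-to-end, the class above can be driven roughly as follows (a sketch, not part of the file: the checkpoint, audio path, and half-precision dtype are assumptions, and ffmpeg must be installed for file inputs):

    import jax.numpy as jnp

    pipeline = FlaxWhisperPipeline("openai/whisper-large-v2", dtype=jnp.bfloat16)

    # Transcribe a long audio file in 30 s chunks, with utterance-level timestamps.
    outputs = pipeline("audio.mp3", chunk_length_s=30.0, return_timestamps=True)
    print(outputs["text"])
    for chunk in outputs.get("chunks", []):
        print(chunk)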
distil_whisper/train_state.py ADDED
@@ -0,0 +1,118 @@
1
+ from typing import Any, Mapping, MutableMapping, Optional, Tuple
2
+
3
+ import flax.core
4
+ import flax.serialization
5
+ import flax.struct
6
+ import jax.numpy as jnp
7
+ from flax import traverse_util
8
+ from flax.core import scope as flax_scope
9
+ from flax.linen import partitioning as flax_partitioning
10
+
11
+
12
+ EMPTY_DICT = flax.core.freeze({})
13
+ FrozenDict = flax_scope.FrozenDict
14
+ FrozenVariableDict = flax_scope.FrozenVariableDict
15
+ MutableVariableDict = flax_scope.MutableVariableDict
16
+ VariableDict = flax_scope.VariableDict
17
+
18
+
19
+ def _validate_params_axes(params_axes, params):
20
+ axis_names = flax_partitioning.get_axis_names(params_axes)
21
+ missing_params_axes = set(traverse_util.flatten_dict(params, sep="/")) - set(
22
+ traverse_util.flatten_dict(axis_names, sep="/")
23
+ )
24
+ if missing_params_axes:
25
+ raise ValueError(f"Missing axis names for parameters: {missing_params_axes}")
26
+
27
+
28
+ def _split_variables_and_axes(
29
+ variables_and_axes: FrozenVariableDict,
30
+ ) -> Tuple[FrozenVariableDict, FrozenVariableDict]:
31
+ """Splits `variables_and_axes` into two separate dicts with the same keys."""
32
+ # For each `key`, `key_axes` (if any) are its axes in `variables_and_axes`.
33
+ variables = {}
34
+ axes = {}
35
+ for k, v in variables_and_axes.items():
36
+ if k.endswith("_axes"):
37
+ axes[k[:-5]] = v # k without "_axes".
38
+ _validate_params_axes(v, variables_and_axes[k[:-5]]) # k without "_axes".
39
+ else:
40
+ variables[k] = v
41
+ return flax.core.freeze(variables), flax.core.freeze(axes)
42
+
43
+
44
+ class InferenceState(flax.struct.PyTreeNode):
45
+ """State compatible with FlaxOptimTrainState without optimizer state."""
46
+
47
+ step: jnp.ndarray
48
+ params: flax_scope.FrozenVariableDict
49
+ params_axes: Optional[flax_scope.FrozenVariableDict] = None
50
+ flax_mutables: flax_scope.FrozenDict = EMPTY_DICT
51
+ flax_mutables_axes: Optional[flax_scope.FrozenVariableDict] = None
52
+
53
+ @classmethod
54
+ def create(cls, model_variables: FrozenVariableDict) -> "InferenceState":
55
+ other_variables, params = model_variables.pop("params")
56
+ if "params_axes" in other_variables:
57
+ other_variables, params_axes = other_variables.pop("params_axes")
58
+ _validate_params_axes(params_axes, params)
59
+ else:
60
+ params_axes = None
61
+
62
+ # Split other_variables into mutables and their corresponding axes.
63
+ flax_mutables, flax_mutables_axes = _split_variables_and_axes(other_variables)
64
+ flax_mutables_axes = flax_mutables_axes or None
65
+ return InferenceState(
66
+ step=jnp.array(0),
67
+ params=params,
68
+ params_axes=params_axes,
69
+ flax_mutables=flax_mutables,
70
+ flax_mutables_axes=flax_mutables_axes,
71
+ )
72
+
73
+ @property
74
+ def param_states(self) -> FrozenVariableDict:
75
+ """The optimizer states of the parameters as a PyTree."""
76
+ raise NotImplementedError("InferenceState has no optimizer states.")
77
+
78
+ def apply_gradient(self, *args, **kwargs) -> "InferenceState":
79
+ raise NotImplementedError("InferenceState does not support `apply_gradient`.")
80
+
81
+ def state_dict(self) -> MutableMapping[str, Any]:
82
+ state_dict = {
83
+ "target": flax.core.unfreeze(self.params),
84
+ "state": {"step": self.step},
85
+ }
86
+ if self.flax_mutables:
87
+ state_dict["flax_mutables"] = flax.core.unfreeze(self.flax_mutables)
88
+ return state_dict
89
+
90
+ def replace_step(self, step: jnp.ndarray) -> "InferenceState":
91
+ return self.replace(step=step)
92
+
93
+ def replace_params(self, params: FrozenVariableDict) -> "InferenceState":
94
+ return self.replace(params=params)
95
+
96
+ def replace_flax_mutables(self, flax_mutables: FrozenDict) -> "InferenceState":
97
+ return self.replace(flax_mutables=flax_mutables)
98
+
99
+ def restore_state(self, state_dict: Mapping[str, Any]) -> "InferenceState":
100
+ return self.replace(
101
+ params=flax.core.freeze(state_dict["target"]),
102
+ step=state_dict["state"]["step"],
103
+ flax_mutables=(
104
+ flax.core.freeze(state_dict["flax_mutables"]) if "flax_mutables" in state_dict else EMPTY_DICT
105
+ ),
106
+ )
107
+
108
+ def as_logical_axes(self) -> "InferenceState":
109
+ # Set step to None so that when the logical axes are processed by the
110
+ # flax.partitioning.logical_to_mesh_axes function, it will be skipped
111
+ # because jax.tree_map will short circut and never call the function on the
112
+ # step.
113
+ flax_mutables_axes = self.flax_mutables_axes or EMPTY_DICT
114
+ return InferenceState(
115
+ step=None,
116
+ params=flax_partitioning.get_axis_names(self.params_axes),
117
+ flax_mutables=flax_partitioning.get_axis_names(flax_mutables_axes),
118
+ )
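A minimal sketch of how `InferenceState` might be constructed from a pretrained model's variables (hedged: the checkpoint name, the import path, and the absence of `params_axes` in the loaded variables are assumptions):

    import flax.core
    from distil_whisper.modeling_flax_whisper import FlaxWhisperForConditionalGeneration

    model, params = FlaxWhisperForConditionalGeneration.from_pretrained(
        "openai/whisper-tiny", _do_init=False
    )
    state = InferenceState.create(flax.core.freeze({"params": params}))
    print(int(state.step))  # -> 0
    # state.param_states would raise NotImplementedError: there is no optimizer state here.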
generation_config.json ADDED
@@ -0,0 +1,271 @@
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 7,
5
+ 0
6
+ ],
7
+ [
8
+ 10,
9
+ 17
10
+ ],
11
+ [
12
+ 12,
13
+ 18
14
+ ],
15
+ [
16
+ 13,
17
+ 12
18
+ ],
19
+ [
20
+ 16,
21
+ 1
22
+ ],
23
+ [
24
+ 17,
25
+ 14
26
+ ],
27
+ [
28
+ 19,
29
+ 11
30
+ ],
31
+ [
32
+ 21,
33
+ 4
34
+ ],
35
+ [
36
+ 24,
37
+ 1
38
+ ],
39
+ [
40
+ 25,
41
+ 6
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "forced_decoder_ids": [
52
+ [
53
+ 1,
54
+ 50288
55
+ ],
56
+ [
57
+ 2,
58
+ 50360
59
+ ],
60
+ [
61
+ 3,
62
+ 50364
63
+ ]
64
+ ],
65
+ "is_multilingual": true,
66
+ "lang_to_id": {
67
+ "<|af|>": 50327,
68
+ "<|am|>": 50334,
69
+ "<|ar|>": 50272,
70
+ "<|as|>": 50350,
71
+ "<|az|>": 50304,
72
+ "<|ba|>": 50355,
73
+ "<|be|>": 50330,
74
+ "<|bg|>": 50292,
75
+ "<|bn|>": 50302,
76
+ "<|bo|>": 50347,
77
+ "<|br|>": 50309,
78
+ "<|bs|>": 50315,
79
+ "<|ca|>": 50270,
80
+ "<|cs|>": 50283,
81
+ "<|cy|>": 50297,
82
+ "<|da|>": 50285,
83
+ "<|de|>": 50261,
84
+ "<|el|>": 50281,
85
+ "<|en|>": 50259,
86
+ "<|es|>": 50262,
87
+ "<|et|>": 50307,
88
+ "<|eu|>": 50310,
89
+ "<|fa|>": 50300,
90
+ "<|fi|>": 50277,
91
+ "<|fo|>": 50338,
92
+ "<|fr|>": 50265,
93
+ "<|gl|>": 50319,
94
+ "<|gu|>": 50333,
95
+ "<|haw|>": 50352,
96
+ "<|ha|>": 50354,
97
+ "<|he|>": 50279,
98
+ "<|hi|>": 50276,
99
+ "<|hr|>": 50291,
100
+ "<|ht|>": 50339,
101
+ "<|hu|>": 50286,
102
+ "<|hy|>": 50312,
103
+ "<|id|>": 50275,
104
+ "<|is|>": 50311,
105
+ "<|it|>": 50274,
106
+ "<|ja|>": 50266,
107
+ "<|jw|>": 50356,
108
+ "<|ka|>": 50329,
109
+ "<|kk|>": 50316,
110
+ "<|km|>": 50323,
111
+ "<|kn|>": 50306,
112
+ "<|ko|>": 50264,
113
+ "<|la|>": 50294,
114
+ "<|lb|>": 50345,
115
+ "<|ln|>": 50353,
116
+ "<|lo|>": 50336,
117
+ "<|lt|>": 50293,
118
+ "<|lv|>": 50301,
119
+ "<|mg|>": 50349,
120
+ "<|mi|>": 50295,
121
+ "<|mk|>": 50308,
122
+ "<|ml|>": 50296,
123
+ "<|mn|>": 50314,
124
+ "<|mr|>": 50320,
125
+ "<|ms|>": 50282,
126
+ "<|mt|>": 50343,
127
+ "<|my|>": 50346,
128
+ "<|ne|>": 50313,
129
+ "<|nl|>": 50271,
130
+ "<|nn|>": 50342,
131
+ "<|no|>": 50288,
132
+ "<|oc|>": 50328,
133
+ "<|pa|>": 50321,
134
+ "<|pl|>": 50269,
135
+ "<|ps|>": 50340,
136
+ "<|pt|>": 50267,
137
+ "<|ro|>": 50284,
138
+ "<|ru|>": 50263,
139
+ "<|sa|>": 50344,
140
+ "<|sd|>": 50332,
141
+ "<|si|>": 50322,
142
+ "<|sk|>": 50298,
143
+ "<|sl|>": 50305,
144
+ "<|sn|>": 50324,
145
+ "<|so|>": 50326,
146
+ "<|sq|>": 50317,
147
+ "<|sr|>": 50303,
148
+ "<|su|>": 50357,
149
+ "<|sv|>": 50273,
150
+ "<|sw|>": 50318,
151
+ "<|ta|>": 50287,
152
+ "<|te|>": 50299,
153
+ "<|tg|>": 50331,
154
+ "<|th|>": 50289,
155
+ "<|tk|>": 50341,
156
+ "<|tl|>": 50348,
157
+ "<|tr|>": 50268,
158
+ "<|tt|>": 50351,
159
+ "<|uk|>": 50280,
160
+ "<|ur|>": 50290,
161
+ "<|uz|>": 50337,
162
+ "<|vi|>": 50278,
163
+ "<|yi|>": 50335,
164
+ "<|yo|>": 50325,
165
+ "<|yue|>": 50358,
166
+ "<|zh|>": 50260
167
+ },
168
+ "language": "<|no|>",
169
+ "max_initial_timestamp_index": 1,
170
+ "max_length": 448,
171
+ "no_timestamps_token_id": 50364,
172
+ "pad_token_id": 50257,
173
+ "return_timestamps": false,
174
+ "suppress_tokens": [
175
+ 1,
176
+ 2,
177
+ 7,
178
+ 8,
179
+ 9,
180
+ 10,
181
+ 14,
182
+ 25,
183
+ 26,
184
+ 27,
185
+ 28,
186
+ 29,
187
+ 31,
188
+ 58,
189
+ 59,
190
+ 60,
191
+ 61,
192
+ 62,
193
+ 63,
194
+ 90,
195
+ 91,
196
+ 92,
197
+ 93,
198
+ 359,
199
+ 503,
200
+ 522,
201
+ 542,
202
+ 873,
203
+ 893,
204
+ 902,
205
+ 918,
206
+ 922,
207
+ 931,
208
+ 1350,
209
+ 1853,
210
+ 1982,
211
+ 2460,
212
+ 2627,
213
+ 3246,
214
+ 3253,
215
+ 3268,
216
+ 3536,
217
+ 3846,
218
+ 3961,
219
+ 4183,
220
+ 4667,
221
+ 6585,
222
+ 6647,
223
+ 7273,
224
+ 9061,
225
+ 9383,
226
+ 10428,
227
+ 10929,
228
+ 11938,
229
+ 12033,
230
+ 12331,
231
+ 12562,
232
+ 13793,
233
+ 14157,
234
+ 14635,
235
+ 15265,
236
+ 15618,
237
+ 16553,
238
+ 16604,
239
+ 18362,
240
+ 18956,
241
+ 20075,
242
+ 21675,
243
+ 22520,
244
+ 26130,
245
+ 26161,
246
+ 26435,
247
+ 28279,
248
+ 29464,
249
+ 31650,
250
+ 32302,
251
+ 32470,
252
+ 36865,
253
+ 42863,
254
+ 47425,
255
+ 49870,
256
+ 50254,
257
+ 50258,
258
+ 50359,
259
+ 50360,
260
+ 50361,
261
+ 50362,
262
+ 50363
263
+ ],
264
+ "task": "transcribe",
265
+ "task_to_id": {
266
+ "transcribe": 50360,
267
+ "translate": 50359
268
+ },
269
+ "transformers_version": "4.46.1",
270
+ "use_scan": false
271
+ }
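The fields above are what `get_forced_decoder_ids` in `distil_whisper/pipeline.py` reads at generation time; a quick way to inspect them (hedged: assumes the config is loaded from the local repo directory):

    from transformers import GenerationConfig

    gen_config = GenerationConfig.from_pretrained(".")
    print(gen_config.lang_to_id["<|no|>"])      # -> 50288
    print(gen_config.task_to_id["transcribe"])  # -> 50360
    print(gen_config.no_timestamps_token_id)    # -> 50364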
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
nb-distil-large-init/added_tokens.json ADDED
@@ -0,0 +1,1611 @@
1
+ {
2
+ "<|0.00|>": 50365,
3
+ "<|0.02|>": 50366,
4
+ "<|0.04|>": 50367,
5
+ "<|0.06|>": 50368,
6
+ "<|0.08|>": 50369,
7
+ "<|0.10|>": 50370,
8
+ "<|0.12|>": 50371,
9
+ "<|0.14|>": 50372,
10
+ "<|0.16|>": 50373,
11
+ "<|0.18|>": 50374,
12
+ "<|0.20|>": 50375,
13
+ "<|0.22|>": 50376,
14
+ "<|0.24|>": 50377,
15
+ "<|0.26|>": 50378,
16
+ "<|0.28|>": 50379,
17
+ "<|0.30|>": 50380,
18
+ "<|0.32|>": 50381,
19
+ "<|0.34|>": 50382,
20
+ "<|0.36|>": 50383,
21
+ "<|0.38|>": 50384,
22
+ "<|0.40|>": 50385,
23
+ "<|0.42|>": 50386,
24
+ "<|0.44|>": 50387,
25
+ "<|0.46|>": 50388,
26
+ "<|0.48|>": 50389,
27
+ "<|0.50|>": 50390,
28
+ "<|0.52|>": 50391,
29
+ "<|0.54|>": 50392,
30
+ "<|0.56|>": 50393,
31
+ "<|0.58|>": 50394,
32
+ "<|0.60|>": 50395,
33
+ "<|0.62|>": 50396,
34
+ "<|0.64|>": 50397,
35
+ "<|0.66|>": 50398,
36
+ "<|0.68|>": 50399,
37
+ "<|0.70|>": 50400,
38
+ "<|0.72|>": 50401,
39
+ "<|0.74|>": 50402,
40
+ "<|0.76|>": 50403,
41
+ "<|0.78|>": 50404,
42
+ "<|0.80|>": 50405,
43
+ "<|0.82|>": 50406,
44
+ "<|0.84|>": 50407,
45
+ "<|0.86|>": 50408,
46
+ "<|0.88|>": 50409,
47
+ "<|0.90|>": 50410,
48
+ "<|0.92|>": 50411,
49
+ "<|0.94|>": 50412,
50
+ "<|0.96|>": 50413,
51
+ "<|0.98|>": 50414,
52
+ "<|1.00|>": 50415,
53
+ "<|1.02|>": 50416,
54
+ "<|1.04|>": 50417,
55
+ "<|1.06|>": 50418,
56
+ "<|1.08|>": 50419,
57
+ "<|1.10|>": 50420,
58
+ "<|1.12|>": 50421,
59
+ "<|1.14|>": 50422,
60
+ "<|1.16|>": 50423,
61
+ "<|1.18|>": 50424,
62
+ "<|1.20|>": 50425,
63
+ "<|1.22|>": 50426,
64
+ "<|1.24|>": 50427,
65
+ "<|1.26|>": 50428,
66
+ "<|1.28|>": 50429,
67
+ "<|1.30|>": 50430,
68
+ "<|1.32|>": 50431,
69
+ "<|1.34|>": 50432,
70
+ "<|1.36|>": 50433,
71
+ "<|1.38|>": 50434,
72
+ "<|1.40|>": 50435,
73
+ "<|1.42|>": 50436,
74
+ "<|1.44|>": 50437,
75
+ "<|1.46|>": 50438,
76
+ "<|1.48|>": 50439,
77
+ "<|1.50|>": 50440,
78
+ "<|1.52|>": 50441,
79
+ "<|1.54|>": 50442,
80
+ "<|1.56|>": 50443,
81
+ "<|1.58|>": 50444,
82
+ "<|1.60|>": 50445,
83
+ "<|1.62|>": 50446,
84
+ "<|1.64|>": 50447,
85
+ "<|1.66|>": 50448,
86
+ "<|1.68|>": 50449,
87
+ "<|1.70|>": 50450,
88
+ "<|1.72|>": 50451,
89
+ "<|1.74|>": 50452,
90
+ "<|1.76|>": 50453,
91
+ "<|1.78|>": 50454,
92
+ "<|1.80|>": 50455,
93
+ "<|1.82|>": 50456,
94
+ "<|1.84|>": 50457,
95
+ "<|1.86|>": 50458,
96
+ "<|1.88|>": 50459,
97
+ "<|1.90|>": 50460,
98
+ "<|1.92|>": 50461,
99
+ "<|1.94|>": 50462,
100
+ "<|1.96|>": 50463,
101
+ "<|1.98|>": 50464,
102
+ "<|10.00|>": 50865,
103
+ "<|10.02|>": 50866,
104
+ "<|10.04|>": 50867,
105
+ "<|10.06|>": 50868,
106
+ "<|10.08|>": 50869,
107
+ "<|10.10|>": 50870,
108
+ "<|10.12|>": 50871,
109
+ "<|10.14|>": 50872,
110
+ "<|10.16|>": 50873,
111
+ "<|10.18|>": 50874,
112
+ "<|10.20|>": 50875,
113
+ "<|10.22|>": 50876,
114
+ "<|10.24|>": 50877,
115
+ "<|10.26|>": 50878,
116
+ "<|10.28|>": 50879,
117
+ "<|10.30|>": 50880,
118
+ "<|10.32|>": 50881,
119
+ "<|10.34|>": 50882,
120
+ "<|10.36|>": 50883,
121
+ "<|10.38|>": 50884,
122
+ "<|10.40|>": 50885,
123
+ "<|10.42|>": 50886,
124
+ "<|10.44|>": 50887,
125
+ "<|10.46|>": 50888,
126
+ "<|10.48|>": 50889,
127
+ "<|10.50|>": 50890,
128
+ "<|10.52|>": 50891,
129
+ "<|10.54|>": 50892,
130
+ "<|10.56|>": 50893,
131
+ "<|10.58|>": 50894,
132
+ "<|10.60|>": 50895,
133
+ "<|10.62|>": 50896,
134
+ "<|10.64|>": 50897,
135
+ "<|10.66|>": 50898,
136
+ "<|10.68|>": 50899,
137
+ "<|10.70|>": 50900,
138
+ "<|10.72|>": 50901,
139
+ "<|10.74|>": 50902,
140
+ "<|10.76|>": 50903,
141
+ "<|10.78|>": 50904,
142
+ "<|10.80|>": 50905,
143
+ "<|10.82|>": 50906,
144
+ "<|10.84|>": 50907,
145
+ "<|10.86|>": 50908,
146
+ "<|10.88|>": 50909,
147
+ "<|10.90|>": 50910,
148
+ "<|10.92|>": 50911,
149
+ "<|10.94|>": 50912,
150
+ "<|10.96|>": 50913,
151
+ "<|10.98|>": 50914,
152
+ "<|11.00|>": 50915,
153
+ "<|11.02|>": 50916,
154
+ "<|11.04|>": 50917,
155
+ "<|11.06|>": 50918,
156
+ "<|11.08|>": 50919,
157
+ "<|11.10|>": 50920,
158
+ "<|11.12|>": 50921,
159
+ "<|11.14|>": 50922,
160
+ "<|11.16|>": 50923,
161
+ "<|11.18|>": 50924,
162
+ "<|11.20|>": 50925,
163
+ "<|11.22|>": 50926,
164
+ "<|11.24|>": 50927,
165
+ "<|11.26|>": 50928,
166
+ "<|11.28|>": 50929,
167
+ "<|11.30|>": 50930,
168
+ "<|11.32|>": 50931,
169
+ "<|11.34|>": 50932,
170
+ "<|11.36|>": 50933,
171
+ "<|11.38|>": 50934,
172
+ "<|11.40|>": 50935,
173
+ "<|11.42|>": 50936,
174
+ "<|11.44|>": 50937,
175
+ "<|11.46|>": 50938,
176
+ "<|11.48|>": 50939,
177
+ "<|11.50|>": 50940,
178
+ "<|11.52|>": 50941,
179
+ "<|11.54|>": 50942,
180
+ "<|11.56|>": 50943,
181
+ "<|11.58|>": 50944,
182
+ "<|11.60|>": 50945,
183
+ "<|11.62|>": 50946,
184
+ "<|11.64|>": 50947,
185
+ "<|11.66|>": 50948,
186
+ "<|11.68|>": 50949,
187
+ "<|11.70|>": 50950,
188
+ "<|11.72|>": 50951,
189
+ "<|11.74|>": 50952,
190
+ "<|11.76|>": 50953,
191
+ "<|11.78|>": 50954,
192
+ "<|11.80|>": 50955,
193
+ "<|11.82|>": 50956,
194
+ "<|11.84|>": 50957,
195
+ "<|11.86|>": 50958,
196
+ "<|11.88|>": 50959,
197
+ "<|11.90|>": 50960,
198
+ "<|11.92|>": 50961,
199
+ "<|11.94|>": 50962,
200
+ "<|11.96|>": 50963,
201
+ "<|11.98|>": 50964,
202
+ "<|12.00|>": 50965,
203
+ "<|12.02|>": 50966,
204
+ "<|12.04|>": 50967,
205
+ "<|12.06|>": 50968,
206
+ "<|12.08|>": 50969,
207
+ "<|12.10|>": 50970,
208
+ "<|12.12|>": 50971,
209
+ "<|12.14|>": 50972,
210
+ "<|12.16|>": 50973,
211
+ "<|12.18|>": 50974,
212
+ "<|12.20|>": 50975,
213
+ "<|12.22|>": 50976,
214
+ "<|12.24|>": 50977,
215
+ "<|12.26|>": 50978,
216
+ "<|12.28|>": 50979,
217
+ "<|12.30|>": 50980,
218
+ "<|12.32|>": 50981,
219
+ "<|12.34|>": 50982,
220
+ "<|12.36|>": 50983,
221
+ "<|12.38|>": 50984,
222
+ "<|12.40|>": 50985,
223
+ "<|12.42|>": 50986,
224
+ "<|12.44|>": 50987,
225
+ "<|12.46|>": 50988,
226
+ "<|12.48|>": 50989,
227
+ "<|12.50|>": 50990,
228
+ "<|12.52|>": 50991,
229
+ "<|12.54|>": 50992,
230
+ "<|12.56|>": 50993,
231
+ "<|12.58|>": 50994,
232
+ "<|12.60|>": 50995,
233
+ "<|12.62|>": 50996,
234
+ "<|12.64|>": 50997,
235
+ "<|12.66|>": 50998,
236
+ "<|12.68|>": 50999,
237
+ "<|12.70|>": 51000,
238
+ "<|12.72|>": 51001,
239
+ "<|12.74|>": 51002,
240
+ "<|12.76|>": 51003,
241
+ "<|12.78|>": 51004,
242
+ "<|12.80|>": 51005,
243
+ "<|12.82|>": 51006,
244
+ "<|12.84|>": 51007,
245
+ "<|12.86|>": 51008,
246
+ "<|12.88|>": 51009,
247
+ "<|12.90|>": 51010,
248
+ "<|12.92|>": 51011,
249
+ "<|12.94|>": 51012,
250
+ "<|12.96|>": 51013,
251
+ "<|12.98|>": 51014,
252
+ "<|13.00|>": 51015,
253
+ "<|13.02|>": 51016,
254
+ "<|13.04|>": 51017,
255
+ "<|13.06|>": 51018,
256
+ "<|13.08|>": 51019,
257
+ "<|13.10|>": 51020,
258
+ "<|13.12|>": 51021,
259
+ "<|13.14|>": 51022,
260
+ "<|13.16|>": 51023,
261
+ "<|13.18|>": 51024,
262
+ "<|13.20|>": 51025,
263
+ "<|13.22|>": 51026,
264
+ "<|13.24|>": 51027,
265
+ "<|13.26|>": 51028,
266
+ "<|13.28|>": 51029,
267
+ "<|13.30|>": 51030,
268
+ "<|13.32|>": 51031,
269
+ "<|13.34|>": 51032,
270
+ "<|13.36|>": 51033,
271
+ "<|13.38|>": 51034,
272
+ "<|13.40|>": 51035,
273
+ "<|13.42|>": 51036,
274
+ "<|13.44|>": 51037,
275
+ "<|13.46|>": 51038,
276
+ "<|13.48|>": 51039,
277
+ "<|13.50|>": 51040,
278
+ "<|13.52|>": 51041,
279
+ "<|13.54|>": 51042,
280
+ "<|13.56|>": 51043,
281
+ "<|13.58|>": 51044,
282
+ "<|13.60|>": 51045,
283
+ "<|13.62|>": 51046,
284
+ "<|13.64|>": 51047,
285
+ "<|13.66|>": 51048,
286
+ "<|13.68|>": 51049,
287
+ "<|13.70|>": 51050,
288
+ "<|13.72|>": 51051,
289
+ "<|13.74|>": 51052,
290
+ "<|13.76|>": 51053,
291
+ "<|13.78|>": 51054,
292
+ "<|13.80|>": 51055,
293
+ "<|13.82|>": 51056,
294
+ "<|13.84|>": 51057,
295
+ "<|13.86|>": 51058,
296
+ "<|13.88|>": 51059,
297
+ "<|13.90|>": 51060,
298
+ "<|13.92|>": 51061,
299
+ "<|13.94|>": 51062,
300
+ "<|13.96|>": 51063,
301
+ "<|13.98|>": 51064,
302
+ "<|14.00|>": 51065,
303
+ "<|14.02|>": 51066,
304
+ "<|14.04|>": 51067,
305
+ "<|14.06|>": 51068,
306
+ "<|14.08|>": 51069,
307
+ "<|14.10|>": 51070,
308
+ "<|14.12|>": 51071,
309
+ "<|14.14|>": 51072,
310
+ "<|14.16|>": 51073,
311
+ "<|14.18|>": 51074,
312
+ "<|14.20|>": 51075,
313
+ "<|14.22|>": 51076,
314
+ "<|14.24|>": 51077,
315
+ "<|14.26|>": 51078,
316
+ "<|14.28|>": 51079,
317
+ "<|14.30|>": 51080,
318
+ "<|14.32|>": 51081,
319
+ "<|14.34|>": 51082,
320
+ "<|14.36|>": 51083,
321
+ "<|14.38|>": 51084,
322
+ "<|14.40|>": 51085,
323
+ "<|14.42|>": 51086,
324
+ "<|14.44|>": 51087,
325
+ "<|14.46|>": 51088,
326
+ "<|14.48|>": 51089,
327
+ "<|14.50|>": 51090,
328
+ "<|14.52|>": 51091,
329
+ "<|14.54|>": 51092,
330
+ "<|14.56|>": 51093,
331
+ "<|14.58|>": 51094,
332
+ "<|14.60|>": 51095,
333
+ "<|14.62|>": 51096,
334
+ "<|14.64|>": 51097,
335
+ "<|14.66|>": 51098,
336
+ "<|14.68|>": 51099,
337
+ "<|14.70|>": 51100,
338
+ "<|14.72|>": 51101,
339
+ "<|14.74|>": 51102,
340
+ "<|14.76|>": 51103,
341
+ "<|14.78|>": 51104,
342
+ "<|14.80|>": 51105,
343
+ "<|14.82|>": 51106,
344
+ "<|14.84|>": 51107,
345
+ "<|14.86|>": 51108,
346
+ "<|14.88|>": 51109,
347
+ "<|14.90|>": 51110,
348
+ "<|14.92|>": 51111,
349
+ "<|14.94|>": 51112,
350
+ "<|14.96|>": 51113,
351
+ "<|14.98|>": 51114,
352
+ "<|15.00|>": 51115,
353
+ "<|15.02|>": 51116,
354
+ "<|15.04|>": 51117,
355
+ "<|15.06|>": 51118,
356
+ "<|15.08|>": 51119,
357
+ "<|15.10|>": 51120,
358
+ "<|15.12|>": 51121,
359
+ "<|15.14|>": 51122,
360
+ "<|15.16|>": 51123,
361
+ "<|15.18|>": 51124,
362
+ "<|15.20|>": 51125,
363
+ "<|15.22|>": 51126,
364
+ "<|15.24|>": 51127,
365
+ "<|15.26|>": 51128,
366
+ "<|15.28|>": 51129,
367
+ "<|15.30|>": 51130,
368
+ "<|15.32|>": 51131,
369
+ "<|15.34|>": 51132,
370
+ "<|15.36|>": 51133,
371
+ "<|15.38|>": 51134,
372
+ "<|15.40|>": 51135,
373
+ "<|15.42|>": 51136,
374
+ "<|15.44|>": 51137,
375
+ "<|15.46|>": 51138,
376
+ "<|15.48|>": 51139,
377
+ "<|15.50|>": 51140,
378
+ "<|15.52|>": 51141,
379
+ "<|15.54|>": 51142,
380
+ "<|15.56|>": 51143,
381
+ "<|15.58|>": 51144,
382
+ "<|15.60|>": 51145,
383
+ "<|15.62|>": 51146,
384
+ "<|15.64|>": 51147,
385
+ "<|15.66|>": 51148,
386
+ "<|15.68|>": 51149,
387
+ "<|15.70|>": 51150,
388
+ "<|15.72|>": 51151,
389
+ "<|15.74|>": 51152,
390
+ "<|15.76|>": 51153,
391
+ "<|15.78|>": 51154,
392
+ "<|15.80|>": 51155,
393
+ "<|15.82|>": 51156,
394
+ "<|15.84|>": 51157,
395
+ "<|15.86|>": 51158,
396
+ "<|15.88|>": 51159,
397
+ "<|15.90|>": 51160,
398
+ "<|15.92|>": 51161,
399
+ "<|15.94|>": 51162,
400
+ "<|15.96|>": 51163,
401
+ "<|15.98|>": 51164,
402
+ "<|16.00|>": 51165,
403
+ "<|16.02|>": 51166,
404
+ "<|16.04|>": 51167,
405
+ "<|16.06|>": 51168,
406
+ "<|16.08|>": 51169,
407
+ "<|16.10|>": 51170,
408
+ "<|16.12|>": 51171,
409
+ "<|16.14|>": 51172,
410
+ "<|16.16|>": 51173,
411
+ "<|16.18|>": 51174,
412
+ "<|16.20|>": 51175,
413
+ "<|16.22|>": 51176,
414
+ "<|16.24|>": 51177,
415
+ "<|16.26|>": 51178,
416
+ "<|16.28|>": 51179,
417
+ "<|16.30|>": 51180,
418
+ "<|16.32|>": 51181,
419
+ "<|16.34|>": 51182,
420
+ "<|16.36|>": 51183,
421
+ "<|16.38|>": 51184,
422
+ "<|16.40|>": 51185,
423
+ "<|16.42|>": 51186,
424
+ "<|16.44|>": 51187,
425
+ "<|16.46|>": 51188,
426
+ "<|16.48|>": 51189,
427
+ "<|16.50|>": 51190,
428
+ "<|16.52|>": 51191,
429
+ "<|16.54|>": 51192,
430
+ "<|16.56|>": 51193,
431
+ "<|16.58|>": 51194,
432
+ "<|16.60|>": 51195,
433
+ "<|16.62|>": 51196,
434
+ "<|16.64|>": 51197,
435
+ "<|16.66|>": 51198,
436
+ "<|16.68|>": 51199,
437
+ "<|16.70|>": 51200,
438
+ "<|16.72|>": 51201,
439
+ "<|16.74|>": 51202,
440
+ "<|16.76|>": 51203,
441
+ "<|16.78|>": 51204,
442
+ "<|16.80|>": 51205,
443
+ "<|16.82|>": 51206,
444
+ "<|16.84|>": 51207,
445
+ "<|16.86|>": 51208,
446
+ "<|16.88|>": 51209,
447
+ "<|16.90|>": 51210,
448
+ "<|16.92|>": 51211,
449
+ "<|16.94|>": 51212,
450
+ "<|16.96|>": 51213,
451
+ "<|16.98|>": 51214,
452
+ "<|17.00|>": 51215,
453
+ "<|17.02|>": 51216,
454
+ "<|17.04|>": 51217,
455
+ "<|17.06|>": 51218,
456
+ "<|17.08|>": 51219,
457
+ "<|17.10|>": 51220,
458
+ "<|17.12|>": 51221,
459
+ "<|17.14|>": 51222,
460
+ "<|17.16|>": 51223,
461
+ "<|17.18|>": 51224,
462
+ "<|17.20|>": 51225,
463
+ "<|17.22|>": 51226,
464
+ "<|17.24|>": 51227,
465
+ "<|17.26|>": 51228,
466
+ "<|17.28|>": 51229,
467
+ "<|17.30|>": 51230,
468
+ "<|17.32|>": 51231,
469
+ "<|17.34|>": 51232,
470
+ "<|17.36|>": 51233,
471
+ "<|17.38|>": 51234,
472
+ "<|17.40|>": 51235,
473
+ "<|17.42|>": 51236,
474
+ "<|17.44|>": 51237,
475
+ "<|17.46|>": 51238,
476
+ "<|17.48|>": 51239,
477
+ "<|17.50|>": 51240,
478
+ "<|17.52|>": 51241,
479
+ "<|17.54|>": 51242,
480
+ "<|17.56|>": 51243,
481
+ "<|17.58|>": 51244,
482
+ "<|17.60|>": 51245,
483
+ "<|17.62|>": 51246,
484
+ "<|17.64|>": 51247,
485
+ "<|17.66|>": 51248,
486
+ "<|17.68|>": 51249,
487
+ "<|17.70|>": 51250,
488
+ "<|17.72|>": 51251,
489
+ "<|17.74|>": 51252,
490
+ "<|17.76|>": 51253,
491
+ "<|17.78|>": 51254,
492
+ "<|17.80|>": 51255,
493
+ "<|17.82|>": 51256,
494
+ "<|17.84|>": 51257,
495
+ "<|17.86|>": 51258,
496
+ "<|17.88|>": 51259,
497
+ "<|17.90|>": 51260,
498
+ "<|17.92|>": 51261,
499
+ "<|17.94|>": 51262,
500
+ "<|17.96|>": 51263,
501
+ "<|17.98|>": 51264,
502
+ "<|18.00|>": 51265,
503
+ "<|18.02|>": 51266,
504
+ "<|18.04|>": 51267,
505
+ "<|18.06|>": 51268,
506
+ "<|18.08|>": 51269,
507
+ "<|18.10|>": 51270,
508
+ "<|18.12|>": 51271,
509
+ "<|18.14|>": 51272,
510
+ "<|18.16|>": 51273,
511
+ "<|18.18|>": 51274,
512
+ "<|18.20|>": 51275,
513
+ "<|18.22|>": 51276,
514
+ "<|18.24|>": 51277,
515
+ "<|18.26|>": 51278,
516
+ "<|18.28|>": 51279,
517
+ "<|18.30|>": 51280,
518
+ "<|18.32|>": 51281,
519
+ "<|18.34|>": 51282,
520
+ "<|18.36|>": 51283,
521
+ "<|18.38|>": 51284,
522
+ "<|18.40|>": 51285,
523
+ "<|18.42|>": 51286,
524
+ "<|18.44|>": 51287,
525
+ "<|18.46|>": 51288,
526
+ "<|18.48|>": 51289,
527
+ "<|18.50|>": 51290,
528
+ "<|18.52|>": 51291,
529
+ "<|18.54|>": 51292,
530
+ "<|18.56|>": 51293,
531
+ "<|18.58|>": 51294,
532
+ "<|18.60|>": 51295,
533
+ "<|18.62|>": 51296,
534
+ "<|18.64|>": 51297,
535
+ "<|18.66|>": 51298,
536
+ "<|18.68|>": 51299,
537
+ "<|18.70|>": 51300,
538
+ "<|18.72|>": 51301,
539
+ "<|18.74|>": 51302,
540
+ "<|18.76|>": 51303,
541
+ "<|18.78|>": 51304,
542
+ "<|18.80|>": 51305,
543
+ "<|18.82|>": 51306,
544
+ "<|18.84|>": 51307,
545
+ "<|18.86|>": 51308,
546
+ "<|18.88|>": 51309,
547
+ "<|18.90|>": 51310,
548
+ "<|18.92|>": 51311,
549
+ "<|18.94|>": 51312,
550
+ "<|18.96|>": 51313,
551
+ "<|18.98|>": 51314,
552
+ "<|19.00|>": 51315,
553
+ "<|19.02|>": 51316,
554
+ "<|19.04|>": 51317,
555
+ "<|19.06|>": 51318,
556
+ "<|19.08|>": 51319,
557
+ "<|19.10|>": 51320,
558
+ "<|19.12|>": 51321,
559
+ "<|19.14|>": 51322,
560
+ "<|19.16|>": 51323,
561
+ "<|19.18|>": 51324,
562
+ "<|19.20|>": 51325,
563
+ "<|19.22|>": 51326,
564
+ "<|19.24|>": 51327,
565
+ "<|19.26|>": 51328,
566
+ "<|19.28|>": 51329,
567
+ "<|19.30|>": 51330,
568
+ "<|19.32|>": 51331,
569
+ "<|19.34|>": 51332,
570
+ "<|19.36|>": 51333,
571
+ "<|19.38|>": 51334,
572
+ "<|19.40|>": 51335,
573
+ "<|19.42|>": 51336,
574
+ "<|19.44|>": 51337,
575
+ "<|19.46|>": 51338,
576
+ "<|19.48|>": 51339,
577
+ "<|19.50|>": 51340,
578
+ "<|19.52|>": 51341,
579
+ "<|19.54|>": 51342,
580
+ "<|19.56|>": 51343,
581
+ "<|19.58|>": 51344,
582
+ "<|19.60|>": 51345,
583
+ "<|19.62|>": 51346,
584
+ "<|19.64|>": 51347,
585
+ "<|19.66|>": 51348,
586
+ "<|19.68|>": 51349,
587
+ "<|19.70|>": 51350,
588
+ "<|19.72|>": 51351,
589
+ "<|19.74|>": 51352,
590
+ "<|19.76|>": 51353,
591
+ "<|19.78|>": 51354,
592
+ "<|19.80|>": 51355,
593
+ "<|19.82|>": 51356,
594
+ "<|19.84|>": 51357,
595
+ "<|19.86|>": 51358,
596
+ "<|19.88|>": 51359,
597
+ "<|19.90|>": 51360,
598
+ "<|19.92|>": 51361,
599
+ "<|19.94|>": 51362,
600
+ "<|19.96|>": 51363,
601
+ "<|19.98|>": 51364,
602
+ "<|2.00|>": 50465,
603
+ "<|2.02|>": 50466,
604
+ "<|2.04|>": 50467,
605
+ "<|2.06|>": 50468,
606
+ "<|2.08|>": 50469,
607
+ "<|2.10|>": 50470,
608
+ "<|2.12|>": 50471,
609
+ "<|2.14|>": 50472,
610
+ "<|2.16|>": 50473,
611
+ "<|2.18|>": 50474,
612
+ "<|2.20|>": 50475,
613
+ "<|2.22|>": 50476,
614
+ "<|2.24|>": 50477,
615
+ "<|2.26|>": 50478,
616
+ "<|2.28|>": 50479,
617
+ "<|2.30|>": 50480,
618
+ "<|2.32|>": 50481,
619
+ "<|2.34|>": 50482,
620
+ "<|2.36|>": 50483,
621
+ "<|2.38|>": 50484,
622
+ "<|2.40|>": 50485,
623
+ "<|2.42|>": 50486,
624
+ "<|2.44|>": 50487,
625
+ "<|2.46|>": 50488,
626
+ "<|2.48|>": 50489,
627
+ "<|2.50|>": 50490,
628
+ "<|2.52|>": 50491,
629
+ "<|2.54|>": 50492,
630
+ "<|2.56|>": 50493,
631
+ "<|2.58|>": 50494,
632
+ "<|2.60|>": 50495,
633
+ "<|2.62|>": 50496,
634
+ "<|2.64|>": 50497,
635
+ "<|2.66|>": 50498,
636
+ "<|2.68|>": 50499,
637
+ "<|2.70|>": 50500,
638
+ "<|2.72|>": 50501,
639
+ "<|2.74|>": 50502,
640
+ "<|2.76|>": 50503,
641
+ "<|2.78|>": 50504,
642
+ "<|2.80|>": 50505,
643
+ "<|2.82|>": 50506,
644
+ "<|2.84|>": 50507,
645
+ "<|2.86|>": 50508,
646
+ "<|2.88|>": 50509,
647
+ "<|2.90|>": 50510,
648
+ "<|2.92|>": 50511,
649
+ "<|2.94|>": 50512,
650
+ "<|2.96|>": 50513,
651
+ "<|2.98|>": 50514,
652
+ "<|20.00|>": 51365,
653
+ "<|20.02|>": 51366,
654
+ "<|20.04|>": 51367,
655
+ "<|20.06|>": 51368,
656
+ "<|20.08|>": 51369,
657
+ "<|20.10|>": 51370,
658
+ "<|20.12|>": 51371,
659
+ "<|20.14|>": 51372,
660
+ "<|20.16|>": 51373,
661
+ "<|20.18|>": 51374,
662
+ "<|20.20|>": 51375,
663
+ "<|20.22|>": 51376,
664
+ "<|20.24|>": 51377,
665
+ "<|20.26|>": 51378,
666
+ "<|20.28|>": 51379,
667
+ "<|20.30|>": 51380,
668
+ "<|20.32|>": 51381,
669
+ "<|20.34|>": 51382,
670
+ "<|20.36|>": 51383,
671
+ "<|20.38|>": 51384,
672
+ "<|20.40|>": 51385,
673
+ "<|20.42|>": 51386,
674
+ "<|20.44|>": 51387,
675
+ "<|20.46|>": 51388,
676
+ "<|20.48|>": 51389,
677
+ "<|20.50|>": 51390,
678
+ "<|20.52|>": 51391,
679
+ "<|20.54|>": 51392,
680
+ "<|20.56|>": 51393,
681
+ "<|20.58|>": 51394,
682
+ "<|20.60|>": 51395,
683
+ "<|20.62|>": 51396,
684
+ "<|20.64|>": 51397,
685
+ "<|20.66|>": 51398,
686
+ "<|20.68|>": 51399,
687
+ "<|20.70|>": 51400,
688
+ "<|20.72|>": 51401,
689
+ "<|20.74|>": 51402,
690
+ "<|20.76|>": 51403,
691
+ "<|20.78|>": 51404,
692
+ "<|20.80|>": 51405,
693
+ "<|20.82|>": 51406,
694
+ "<|20.84|>": 51407,
695
+ "<|20.86|>": 51408,
696
+ "<|20.88|>": 51409,
697
+ "<|20.90|>": 51410,
698
+ "<|20.92|>": 51411,
699
+ "<|20.94|>": 51412,
700
+ "<|20.96|>": 51413,
701
+ "<|20.98|>": 51414,
702
+ "<|21.00|>": 51415,
703
+ "<|21.02|>": 51416,
704
+ "<|21.04|>": 51417,
705
+ "<|21.06|>": 51418,
706
+ "<|21.08|>": 51419,
707
+ "<|21.10|>": 51420,
708
+ "<|21.12|>": 51421,
709
+ "<|21.14|>": 51422,
710
+ "<|21.16|>": 51423,
711
+ "<|21.18|>": 51424,
712
+ "<|21.20|>": 51425,
713
+ "<|21.22|>": 51426,
714
+ "<|21.24|>": 51427,
715
+ "<|21.26|>": 51428,
716
+ "<|21.28|>": 51429,
717
+ "<|21.30|>": 51430,
718
+ "<|21.32|>": 51431,
719
+ "<|21.34|>": 51432,
720
+ "<|21.36|>": 51433,
721
+ "<|21.38|>": 51434,
722
+ "<|21.40|>": 51435,
723
+ "<|21.42|>": 51436,
724
+ "<|21.44|>": 51437,
725
+ "<|21.46|>": 51438,
726
+ "<|21.48|>": 51439,
727
+ "<|21.50|>": 51440,
728
+ "<|21.52|>": 51441,
729
+ "<|21.54|>": 51442,
730
+ "<|21.56|>": 51443,
731
+ "<|21.58|>": 51444,
732
+ "<|21.60|>": 51445,
733
+ "<|21.62|>": 51446,
734
+ "<|21.64|>": 51447,
735
+ "<|21.66|>": 51448,
736
+ "<|21.68|>": 51449,
737
+ "<|21.70|>": 51450,
738
+ "<|21.72|>": 51451,
739
+ "<|21.74|>": 51452,
740
+ "<|21.76|>": 51453,
741
+ "<|21.78|>": 51454,
742
+ "<|21.80|>": 51455,
743
+ "<|21.82|>": 51456,
744
+ "<|21.84|>": 51457,
745
+ "<|21.86|>": 51458,
746
+ "<|21.88|>": 51459,
747
+ "<|21.90|>": 51460,
748
+ "<|21.92|>": 51461,
749
+ "<|21.94|>": 51462,
750
+ "<|21.96|>": 51463,
751
+ "<|21.98|>": 51464,
752
+ "<|22.00|>": 51465,
753
+ "<|22.02|>": 51466,
754
+ "<|22.04|>": 51467,
755
+ "<|22.06|>": 51468,
756
+ "<|22.08|>": 51469,
757
+ "<|22.10|>": 51470,
758
+ "<|22.12|>": 51471,
759
+ "<|22.14|>": 51472,
760
+ "<|22.16|>": 51473,
761
+ "<|22.18|>": 51474,
762
+ "<|22.20|>": 51475,
763
+ "<|22.22|>": 51476,
764
+ "<|22.24|>": 51477,
765
+ "<|22.26|>": 51478,
766
+ "<|22.28|>": 51479,
767
+ "<|22.30|>": 51480,
768
+ "<|22.32|>": 51481,
769
+ "<|22.34|>": 51482,
770
+ "<|22.36|>": 51483,
771
+ "<|22.38|>": 51484,
772
+ "<|22.40|>": 51485,
773
+ "<|22.42|>": 51486,
774
+ "<|22.44|>": 51487,
775
+ "<|22.46|>": 51488,
776
+ "<|22.48|>": 51489,
777
+ "<|22.50|>": 51490,
778
+ "<|22.52|>": 51491,
779
+ "<|22.54|>": 51492,
780
+ "<|22.56|>": 51493,
781
+ "<|22.58|>": 51494,
782
+ "<|22.60|>": 51495,
783
+ "<|22.62|>": 51496,
784
+ "<|22.64|>": 51497,
785
+ "<|22.66|>": 51498,
786
+ "<|22.68|>": 51499,
787
+ "<|22.70|>": 51500,
788
+ "<|22.72|>": 51501,
789
+ "<|22.74|>": 51502,
790
+ "<|22.76|>": 51503,
791
+ "<|22.78|>": 51504,
792
+ "<|22.80|>": 51505,
793
+ "<|22.82|>": 51506,
794
+ "<|22.84|>": 51507,
795
+ "<|22.86|>": 51508,
796
+ "<|22.88|>": 51509,
797
+ "<|22.90|>": 51510,
798
+ "<|22.92|>": 51511,
799
+ "<|22.94|>": 51512,
800
+ "<|22.96|>": 51513,
801
+ "<|22.98|>": 51514,
802
+ "<|23.00|>": 51515,
803
+ "<|23.02|>": 51516,
804
+ "<|23.04|>": 51517,
805
+ "<|23.06|>": 51518,
806
+ "<|23.08|>": 51519,
807
+ "<|23.10|>": 51520,
808
+ "<|23.12|>": 51521,
809
+ "<|23.14|>": 51522,
810
+ "<|23.16|>": 51523,
811
+ "<|23.18|>": 51524,
812
+ "<|23.20|>": 51525,
813
+ "<|23.22|>": 51526,
814
+ "<|23.24|>": 51527,
815
+ "<|23.26|>": 51528,
816
+ "<|23.28|>": 51529,
817
+ "<|23.30|>": 51530,
818
+ "<|23.32|>": 51531,
819
+ "<|23.34|>": 51532,
820
+ "<|23.36|>": 51533,
821
+ "<|23.38|>": 51534,
822
+ "<|23.40|>": 51535,
823
+ "<|23.42|>": 51536,
824
+ "<|23.44|>": 51537,
825
+ "<|23.46|>": 51538,
826
+ "<|23.48|>": 51539,
827
+ "<|23.50|>": 51540,
828
+ "<|23.52|>": 51541,
829
+ "<|23.54|>": 51542,
830
+ "<|23.56|>": 51543,
831
+ "<|23.58|>": 51544,
832
+ "<|23.60|>": 51545,
833
+ "<|23.62|>": 51546,
834
+ "<|23.64|>": 51547,
835
+ "<|23.66|>": 51548,
836
+ "<|23.68|>": 51549,
837
+ "<|23.70|>": 51550,
838
+ "<|23.72|>": 51551,
839
+ "<|23.74|>": 51552,
840
+ "<|23.76|>": 51553,
841
+ "<|23.78|>": 51554,
842
+ "<|23.80|>": 51555,
843
+ "<|23.82|>": 51556,
844
+ "<|23.84|>": 51557,
845
+ "<|23.86|>": 51558,
846
+ "<|23.88|>": 51559,
847
+ "<|23.90|>": 51560,
848
+ "<|23.92|>": 51561,
849
+ "<|23.94|>": 51562,
850
+ "<|23.96|>": 51563,
851
+ "<|23.98|>": 51564,
852
+ "<|24.00|>": 51565,
853
+ "<|24.02|>": 51566,
854
+ "<|24.04|>": 51567,
855
+ "<|24.06|>": 51568,
856
+ "<|24.08|>": 51569,
857
+ "<|24.10|>": 51570,
858
+ "<|24.12|>": 51571,
859
+ "<|24.14|>": 51572,
860
+ "<|24.16|>": 51573,
861
+ "<|24.18|>": 51574,
862
+ "<|24.20|>": 51575,
863
+ "<|24.22|>": 51576,
864
+ "<|24.24|>": 51577,
865
+ "<|24.26|>": 51578,
866
+ "<|24.28|>": 51579,
867
+ "<|24.30|>": 51580,
868
+ "<|24.32|>": 51581,
869
+ "<|24.34|>": 51582,
870
+ "<|24.36|>": 51583,
871
+ "<|24.38|>": 51584,
872
+ "<|24.40|>": 51585,
873
+ "<|24.42|>": 51586,
874
+ "<|24.44|>": 51587,
875
+ "<|24.46|>": 51588,
876
+ "<|24.48|>": 51589,
877
+ "<|24.50|>": 51590,
878
+ "<|24.52|>": 51591,
879
+ "<|24.54|>": 51592,
880
+ "<|24.56|>": 51593,
881
+ "<|24.58|>": 51594,
882
+ "<|24.60|>": 51595,
883
+ "<|24.62|>": 51596,
884
+ "<|24.64|>": 51597,
885
+ "<|24.66|>": 51598,
886
+ "<|24.68|>": 51599,
887
+ "<|24.70|>": 51600,
888
+ "<|24.72|>": 51601,
889
+ "<|24.74|>": 51602,
890
+ "<|24.76|>": 51603,
891
+ "<|24.78|>": 51604,
892
+ "<|24.80|>": 51605,
893
+ "<|24.82|>": 51606,
894
+ "<|24.84|>": 51607,
895
+ "<|24.86|>": 51608,
896
+ "<|24.88|>": 51609,
897
+ "<|24.90|>": 51610,
898
+ "<|24.92|>": 51611,
899
+ "<|24.94|>": 51612,
900
+ "<|24.96|>": 51613,
901
+ "<|24.98|>": 51614,
902
+ "<|25.00|>": 51615,
903
+ "<|25.02|>": 51616,
904
+ "<|25.04|>": 51617,
905
+ "<|25.06|>": 51618,
906
+ "<|25.08|>": 51619,
907
+ "<|25.10|>": 51620,
908
+ "<|25.12|>": 51621,
909
+ "<|25.14|>": 51622,
910
+ "<|25.16|>": 51623,
911
+ "<|25.18|>": 51624,
912
+ "<|25.20|>": 51625,
913
+ "<|25.22|>": 51626,
914
+ "<|25.24|>": 51627,
915
+ "<|25.26|>": 51628,
916
+ "<|25.28|>": 51629,
917
+ "<|25.30|>": 51630,
918
+ "<|25.32|>": 51631,
919
+ "<|25.34|>": 51632,
920
+ "<|25.36|>": 51633,
921
+ "<|25.38|>": 51634,
922
+ "<|25.40|>": 51635,
923
+ "<|25.42|>": 51636,
924
+ "<|25.44|>": 51637,
925
+ "<|25.46|>": 51638,
926
+ "<|25.48|>": 51639,
927
+ "<|25.50|>": 51640,
928
+ "<|25.52|>": 51641,
929
+ "<|25.54|>": 51642,
930
+ "<|25.56|>": 51643,
931
+ "<|25.58|>": 51644,
932
+ "<|25.60|>": 51645,
933
+ "<|25.62|>": 51646,
934
+ "<|25.64|>": 51647,
935
+ "<|25.66|>": 51648,
936
+ "<|25.68|>": 51649,
937
+ "<|25.70|>": 51650,
938
+ "<|25.72|>": 51651,
939
+ "<|25.74|>": 51652,
940
+ "<|25.76|>": 51653,
941
+ "<|25.78|>": 51654,
942
+ "<|25.80|>": 51655,
943
+ "<|25.82|>": 51656,
944
+ "<|25.84|>": 51657,
945
+ "<|25.86|>": 51658,
946
+ "<|25.88|>": 51659,
947
+ "<|25.90|>": 51660,
948
+ "<|25.92|>": 51661,
949
+ "<|25.94|>": 51662,
950
+ "<|25.96|>": 51663,
951
+ "<|25.98|>": 51664,
952
+ "<|26.00|>": 51665,
953
+ "<|26.02|>": 51666,
954
+ "<|26.04|>": 51667,
955
+ "<|26.06|>": 51668,
956
+ "<|26.08|>": 51669,
957
+ "<|26.10|>": 51670,
958
+ "<|26.12|>": 51671,
959
+ "<|26.14|>": 51672,
960
+ "<|26.16|>": 51673,
961
+ "<|26.18|>": 51674,
962
+ "<|26.20|>": 51675,
963
+ "<|26.22|>": 51676,
964
+ "<|26.24|>": 51677,
965
+ "<|26.26|>": 51678,
966
+ "<|26.28|>": 51679,
967
+ "<|26.30|>": 51680,
968
+ "<|26.32|>": 51681,
969
+ "<|26.34|>": 51682,
970
+ "<|26.36|>": 51683,
971
+ "<|26.38|>": 51684,
972
+ "<|26.40|>": 51685,
973
+ "<|26.42|>": 51686,
974
+ "<|26.44|>": 51687,
975
+ "<|26.46|>": 51688,
976
+ "<|26.48|>": 51689,
977
+ "<|26.50|>": 51690,
978
+ "<|26.52|>": 51691,
979
+ "<|26.54|>": 51692,
980
+ "<|26.56|>": 51693,
981
+ "<|26.58|>": 51694,
982
+ "<|26.60|>": 51695,
983
+ "<|26.62|>": 51696,
984
+ "<|26.64|>": 51697,
985
+ "<|26.66|>": 51698,
986
+ "<|26.68|>": 51699,
987
+ "<|26.70|>": 51700,
988
+ "<|26.72|>": 51701,
989
+ "<|26.74|>": 51702,
990
+ "<|26.76|>": 51703,
991
+ "<|26.78|>": 51704,
992
+ "<|26.80|>": 51705,
993
+ "<|26.82|>": 51706,
994
+ "<|26.84|>": 51707,
995
+ "<|26.86|>": 51708,
996
+ "<|26.88|>": 51709,
997
+ "<|26.90|>": 51710,
998
+ "<|26.92|>": 51711,
999
+ "<|26.94|>": 51712,
1000
+ "<|26.96|>": 51713,
1001
+ "<|26.98|>": 51714,
1002
+ "<|27.00|>": 51715,
1003
+ "<|27.02|>": 51716,
1004
+ "<|27.04|>": 51717,
1005
+ "<|27.06|>": 51718,
1006
+ "<|27.08|>": 51719,
1007
+ "<|27.10|>": 51720,
1008
+ "<|27.12|>": 51721,
1009
+ "<|27.14|>": 51722,
1010
+ "<|27.16|>": 51723,
1011
+ "<|27.18|>": 51724,
1012
+ "<|27.20|>": 51725,
1013
+ "<|27.22|>": 51726,
1014
+ "<|27.24|>": 51727,
1015
+ "<|27.26|>": 51728,
1016
+ "<|27.28|>": 51729,
1017
+ "<|27.30|>": 51730,
1018
+ "<|27.32|>": 51731,
1019
+ "<|27.34|>": 51732,
1020
+ "<|27.36|>": 51733,
1021
+ "<|27.38|>": 51734,
1022
+ "<|27.40|>": 51735,
1023
+ "<|27.42|>": 51736,
1024
+ "<|27.44|>": 51737,
1025
+ "<|27.46|>": 51738,
1026
+ "<|27.48|>": 51739,
1027
+ "<|27.50|>": 51740,
1028
+ "<|27.52|>": 51741,
1029
+ "<|27.54|>": 51742,
1030
+ "<|27.56|>": 51743,
1031
+ "<|27.58|>": 51744,
1032
+ "<|27.60|>": 51745,
1033
+ "<|27.62|>": 51746,
1034
+ "<|27.64|>": 51747,
1035
+ "<|27.66|>": 51748,
1036
+ "<|27.68|>": 51749,
1037
+ "<|27.70|>": 51750,
1038
+ "<|27.72|>": 51751,
1039
+ "<|27.74|>": 51752,
1040
+ "<|27.76|>": 51753,
1041
+ "<|27.78|>": 51754,
1042
+ "<|27.80|>": 51755,
1043
+ "<|27.82|>": 51756,
1044
+ "<|27.84|>": 51757,
1045
+ "<|27.86|>": 51758,
1046
+ "<|27.88|>": 51759,
1047
+ "<|27.90|>": 51760,
1048
+ "<|27.92|>": 51761,
1049
+ "<|27.94|>": 51762,
1050
+ "<|27.96|>": 51763,
1051
+ "<|27.98|>": 51764,
1052
+ "<|28.00|>": 51765,
1053
+ "<|28.02|>": 51766,
1054
+ "<|28.04|>": 51767,
1055
+ "<|28.06|>": 51768,
1056
+ "<|28.08|>": 51769,
1057
+ "<|28.10|>": 51770,
1058
+ "<|28.12|>": 51771,
1059
+ "<|28.14|>": 51772,
1060
+ "<|28.16|>": 51773,
1061
+ "<|28.18|>": 51774,
1062
+ "<|28.20|>": 51775,
1063
+ "<|28.22|>": 51776,
1064
+ "<|28.24|>": 51777,
1065
+ "<|28.26|>": 51778,
1066
+ "<|28.28|>": 51779,
1067
+ "<|28.30|>": 51780,
1068
+ "<|28.32|>": 51781,
1069
+ "<|28.34|>": 51782,
1070
+ "<|28.36|>": 51783,
1071
+ "<|28.38|>": 51784,
1072
+ "<|28.40|>": 51785,
1073
+ "<|28.42|>": 51786,
1074
+ "<|28.44|>": 51787,
1075
+ "<|28.46|>": 51788,
1076
+ "<|28.48|>": 51789,
1077
+ "<|28.50|>": 51790,
1078
+ "<|28.52|>": 51791,
1079
+ "<|28.54|>": 51792,
1080
+ "<|28.56|>": 51793,
1081
+ "<|28.58|>": 51794,
1082
+ "<|28.60|>": 51795,
1083
+ "<|28.62|>": 51796,
1084
+ "<|28.64|>": 51797,
1085
+ "<|28.66|>": 51798,
1086
+ "<|28.68|>": 51799,
1087
+ "<|28.70|>": 51800,
1088
+ "<|28.72|>": 51801,
1089
+ "<|28.74|>": 51802,
1090
+ "<|28.76|>": 51803,
1091
+ "<|28.78|>": 51804,
1092
+ "<|28.80|>": 51805,
1093
+ "<|28.82|>": 51806,
1094
+ "<|28.84|>": 51807,
1095
+ "<|28.86|>": 51808,
1096
+ "<|28.88|>": 51809,
1097
+ "<|28.90|>": 51810,
1098
+ "<|28.92|>": 51811,
1099
+ "<|28.94|>": 51812,
1100
+ "<|28.96|>": 51813,
1101
+ "<|28.98|>": 51814,
1102
+ "<|29.00|>": 51815,
1103
+ "<|29.02|>": 51816,
1104
+ "<|29.04|>": 51817,
1105
+ "<|29.06|>": 51818,
1106
+ "<|29.08|>": 51819,
1107
+ "<|29.10|>": 51820,
1108
+ "<|29.12|>": 51821,
1109
+ "<|29.14|>": 51822,
1110
+ "<|29.16|>": 51823,
1111
+ "<|29.18|>": 51824,
1112
+ "<|29.20|>": 51825,
1113
+ "<|29.22|>": 51826,
1114
+ "<|29.24|>": 51827,
1115
+ "<|29.26|>": 51828,
1116
+ "<|29.28|>": 51829,
1117
+ "<|29.30|>": 51830,
1118
+ "<|29.32|>": 51831,
1119
+ "<|29.34|>": 51832,
1120
+ "<|29.36|>": 51833,
1121
+ "<|29.38|>": 51834,
1122
+ "<|29.40|>": 51835,
1123
+ "<|29.42|>": 51836,
1124
+ "<|29.44|>": 51837,
1125
+ "<|29.46|>": 51838,
1126
+ "<|29.48|>": 51839,
1127
+ "<|29.50|>": 51840,
1128
+ "<|29.52|>": 51841,
1129
+ "<|29.54|>": 51842,
1130
+ "<|29.56|>": 51843,
1131
+ "<|29.58|>": 51844,
1132
+ "<|29.60|>": 51845,
1133
+ "<|29.62|>": 51846,
1134
+ "<|29.64|>": 51847,
1135
+ "<|29.66|>": 51848,
1136
+ "<|29.68|>": 51849,
1137
+ "<|29.70|>": 51850,
1138
+ "<|29.72|>": 51851,
1139
+ "<|29.74|>": 51852,
1140
+ "<|29.76|>": 51853,
1141
+ "<|29.78|>": 51854,
1142
+ "<|29.80|>": 51855,
1143
+ "<|29.82|>": 51856,
1144
+ "<|29.84|>": 51857,
1145
+ "<|29.86|>": 51858,
1146
+ "<|29.88|>": 51859,
1147
+ "<|29.90|>": 51860,
1148
+ "<|29.92|>": 51861,
1149
+ "<|29.94|>": 51862,
1150
+ "<|29.96|>": 51863,
1151
+ "<|29.98|>": 51864,
1152
+ "<|3.00|>": 50515,
1153
+ "<|3.02|>": 50516,
1154
+ "<|3.04|>": 50517,
1155
+ "<|3.06|>": 50518,
1156
+ "<|3.08|>": 50519,
1157
+ "<|3.10|>": 50520,
1158
+ "<|3.12|>": 50521,
1159
+ "<|3.14|>": 50522,
1160
+ "<|3.16|>": 50523,
1161
+ "<|3.18|>": 50524,
1162
+ "<|3.20|>": 50525,
1163
+ "<|3.22|>": 50526,
1164
+ "<|3.24|>": 50527,
1165
+ "<|3.26|>": 50528,
1166
+ "<|3.28|>": 50529,
1167
+ "<|3.30|>": 50530,
1168
+ "<|3.32|>": 50531,
1169
+ "<|3.34|>": 50532,
1170
+ "<|3.36|>": 50533,
1171
+ "<|3.38|>": 50534,
1172
+ "<|3.40|>": 50535,
1173
+ "<|3.42|>": 50536,
1174
+ "<|3.44|>": 50537,
1175
+ "<|3.46|>": 50538,
1176
+ "<|3.48|>": 50539,
1177
+ "<|3.50|>": 50540,
1178
+ "<|3.52|>": 50541,
1179
+ "<|3.54|>": 50542,
1180
+ "<|3.56|>": 50543,
1181
+ "<|3.58|>": 50544,
1182
+ "<|3.60|>": 50545,
1183
+ "<|3.62|>": 50546,
1184
+ "<|3.64|>": 50547,
1185
+ "<|3.66|>": 50548,
1186
+ "<|3.68|>": 50549,
1187
+ "<|3.70|>": 50550,
1188
+ "<|3.72|>": 50551,
1189
+ "<|3.74|>": 50552,
1190
+ "<|3.76|>": 50553,
1191
+ "<|3.78|>": 50554,
1192
+ "<|3.80|>": 50555,
1193
+ "<|3.82|>": 50556,
1194
+ "<|3.84|>": 50557,
1195
+ "<|3.86|>": 50558,
1196
+ "<|3.88|>": 50559,
1197
+ "<|3.90|>": 50560,
1198
+ "<|3.92|>": 50561,
1199
+ "<|3.94|>": 50562,
1200
+ "<|3.96|>": 50563,
1201
+ "<|3.98|>": 50564,
1202
+ "<|30.00|>": 51865,
1203
+ "<|4.00|>": 50565,
1204
+ "<|4.02|>": 50566,
1205
+ "<|4.04|>": 50567,
1206
+ "<|4.06|>": 50568,
1207
+ "<|4.08|>": 50569,
1208
+ "<|4.10|>": 50570,
1209
+ "<|4.12|>": 50571,
1210
+ "<|4.14|>": 50572,
1211
+ "<|4.16|>": 50573,
1212
+ "<|4.18|>": 50574,
1213
+ "<|4.20|>": 50575,
1214
+ "<|4.22|>": 50576,
1215
+ "<|4.24|>": 50577,
1216
+ "<|4.26|>": 50578,
1217
+ "<|4.28|>": 50579,
1218
+ "<|4.30|>": 50580,
1219
+ "<|4.32|>": 50581,
1220
+ "<|4.34|>": 50582,
1221
+ "<|4.36|>": 50583,
1222
+ "<|4.38|>": 50584,
1223
+ "<|4.40|>": 50585,
1224
+ "<|4.42|>": 50586,
1225
+ "<|4.44|>": 50587,
1226
+ "<|4.46|>": 50588,
1227
+ "<|4.48|>": 50589,
1228
+ "<|4.50|>": 50590,
1229
+ "<|4.52|>": 50591,
1230
+ "<|4.54|>": 50592,
1231
+ "<|4.56|>": 50593,
1232
+ "<|4.58|>": 50594,
1233
+ "<|4.60|>": 50595,
1234
+ "<|4.62|>": 50596,
1235
+ "<|4.64|>": 50597,
1236
+ "<|4.66|>": 50598,
1237
+ "<|4.68|>": 50599,
1238
+ "<|4.70|>": 50600,
1239
+ "<|4.72|>": 50601,
1240
+ "<|4.74|>": 50602,
1241
+ "<|4.76|>": 50603,
1242
+ "<|4.78|>": 50604,
1243
+ "<|4.80|>": 50605,
1244
+ "<|4.82|>": 50606,
1245
+ "<|4.84|>": 50607,
1246
+ "<|4.86|>": 50608,
1247
+ "<|4.88|>": 50609,
1248
+ "<|4.90|>": 50610,
1249
+ "<|4.92|>": 50611,
1250
+ "<|4.94|>": 50612,
1251
+ "<|4.96|>": 50613,
1252
+ "<|4.98|>": 50614,
1253
+ "<|5.00|>": 50615,
1254
+ "<|5.02|>": 50616,
1255
+ "<|5.04|>": 50617,
1256
+ "<|5.06|>": 50618,
1257
+ "<|5.08|>": 50619,
1258
+ "<|5.10|>": 50620,
1259
+ "<|5.12|>": 50621,
1260
+ "<|5.14|>": 50622,
1261
+ "<|5.16|>": 50623,
1262
+ "<|5.18|>": 50624,
1263
+ "<|5.20|>": 50625,
1264
+ "<|5.22|>": 50626,
1265
+ "<|5.24|>": 50627,
1266
+ "<|5.26|>": 50628,
1267
+ "<|5.28|>": 50629,
1268
+ "<|5.30|>": 50630,
1269
+ "<|5.32|>": 50631,
1270
+ "<|5.34|>": 50632,
1271
+ "<|5.36|>": 50633,
1272
+ "<|5.38|>": 50634,
1273
+ "<|5.40|>": 50635,
1274
+ "<|5.42|>": 50636,
1275
+ "<|5.44|>": 50637,
1276
+ "<|5.46|>": 50638,
1277
+ "<|5.48|>": 50639,
1278
+ "<|5.50|>": 50640,
1279
+ "<|5.52|>": 50641,
1280
+ "<|5.54|>": 50642,
1281
+ "<|5.56|>": 50643,
1282
+ "<|5.58|>": 50644,
1283
+ "<|5.60|>": 50645,
1284
+ "<|5.62|>": 50646,
1285
+ "<|5.64|>": 50647,
1286
+ "<|5.66|>": 50648,
1287
+ "<|5.68|>": 50649,
1288
+ "<|5.70|>": 50650,
1289
+ "<|5.72|>": 50651,
1290
+ "<|5.74|>": 50652,
1291
+ "<|5.76|>": 50653,
1292
+ "<|5.78|>": 50654,
1293
+ "<|5.80|>": 50655,
1294
+ "<|5.82|>": 50656,
1295
+ "<|5.84|>": 50657,
1296
+ "<|5.86|>": 50658,
1297
+ "<|5.88|>": 50659,
1298
+ "<|5.90|>": 50660,
1299
+ "<|5.92|>": 50661,
1300
+ "<|5.94|>": 50662,
1301
+ "<|5.96|>": 50663,
1302
+ "<|5.98|>": 50664,
1303
+ "<|6.00|>": 50665,
1304
+ "<|6.02|>": 50666,
1305
+ "<|6.04|>": 50667,
1306
+ "<|6.06|>": 50668,
1307
+ "<|6.08|>": 50669,
1308
+ "<|6.10|>": 50670,
1309
+ "<|6.12|>": 50671,
1310
+ "<|6.14|>": 50672,
1311
+ "<|6.16|>": 50673,
1312
+ "<|6.18|>": 50674,
1313
+ "<|6.20|>": 50675,
1314
+ "<|6.22|>": 50676,
1315
+ "<|6.24|>": 50677,
1316
+ "<|6.26|>": 50678,
1317
+ "<|6.28|>": 50679,
1318
+ "<|6.30|>": 50680,
1319
+ "<|6.32|>": 50681,
1320
+ "<|6.34|>": 50682,
1321
+ "<|6.36|>": 50683,
1322
+ "<|6.38|>": 50684,
1323
+ "<|6.40|>": 50685,
1324
+ "<|6.42|>": 50686,
1325
+ "<|6.44|>": 50687,
1326
+ "<|6.46|>": 50688,
1327
+ "<|6.48|>": 50689,
1328
+ "<|6.50|>": 50690,
1329
+ "<|6.52|>": 50691,
1330
+ "<|6.54|>": 50692,
1331
+ "<|6.56|>": 50693,
1332
+ "<|6.58|>": 50694,
1333
+ "<|6.60|>": 50695,
1334
+ "<|6.62|>": 50696,
1335
+ "<|6.64|>": 50697,
1336
+ "<|6.66|>": 50698,
1337
+ "<|6.68|>": 50699,
1338
+ "<|6.70|>": 50700,
1339
+ "<|6.72|>": 50701,
1340
+ "<|6.74|>": 50702,
1341
+ "<|6.76|>": 50703,
1342
+ "<|6.78|>": 50704,
1343
+ "<|6.80|>": 50705,
1344
+ "<|6.82|>": 50706,
1345
+ "<|6.84|>": 50707,
1346
+ "<|6.86|>": 50708,
1347
+ "<|6.88|>": 50709,
1348
+ "<|6.90|>": 50710,
1349
+ "<|6.92|>": 50711,
1350
+ "<|6.94|>": 50712,
1351
+ "<|6.96|>": 50713,
1352
+ "<|6.98|>": 50714,
1353
+ "<|7.00|>": 50715,
1354
+ "<|7.02|>": 50716,
1355
+ "<|7.04|>": 50717,
1356
+ "<|7.06|>": 50718,
1357
+ "<|7.08|>": 50719,
1358
+ "<|7.10|>": 50720,
1359
+ "<|7.12|>": 50721,
1360
+ "<|7.14|>": 50722,
1361
+ "<|7.16|>": 50723,
1362
+ "<|7.18|>": 50724,
1363
+ "<|7.20|>": 50725,
1364
+ "<|7.22|>": 50726,
1365
+ "<|7.24|>": 50727,
1366
+ "<|7.26|>": 50728,
1367
+ "<|7.28|>": 50729,
1368
+ "<|7.30|>": 50730,
1369
+ "<|7.32|>": 50731,
1370
+ "<|7.34|>": 50732,
1371
+ "<|7.36|>": 50733,
1372
+ "<|7.38|>": 50734,
1373
+ "<|7.40|>": 50735,
1374
+ "<|7.42|>": 50736,
1375
+ "<|7.44|>": 50737,
1376
+ "<|7.46|>": 50738,
1377
+ "<|7.48|>": 50739,
1378
+ "<|7.50|>": 50740,
1379
+ "<|7.52|>": 50741,
1380
+ "<|7.54|>": 50742,
1381
+ "<|7.56|>": 50743,
1382
+ "<|7.58|>": 50744,
1383
+ "<|7.60|>": 50745,
1384
+ "<|7.62|>": 50746,
1385
+ "<|7.64|>": 50747,
1386
+ "<|7.66|>": 50748,
1387
+ "<|7.68|>": 50749,
1388
+ "<|7.70|>": 50750,
1389
+ "<|7.72|>": 50751,
1390
+ "<|7.74|>": 50752,
1391
+ "<|7.76|>": 50753,
1392
+ "<|7.78|>": 50754,
1393
+ "<|7.80|>": 50755,
1394
+ "<|7.82|>": 50756,
1395
+ "<|7.84|>": 50757,
1396
+ "<|7.86|>": 50758,
1397
+ "<|7.88|>": 50759,
1398
+ "<|7.90|>": 50760,
1399
+ "<|7.92|>": 50761,
1400
+ "<|7.94|>": 50762,
1401
+ "<|7.96|>": 50763,
1402
+ "<|7.98|>": 50764,
1403
+ "<|8.00|>": 50765,
1404
+ "<|8.02|>": 50766,
1405
+ "<|8.04|>": 50767,
1406
+ "<|8.06|>": 50768,
1407
+ "<|8.08|>": 50769,
1408
+ "<|8.10|>": 50770,
1409
+ "<|8.12|>": 50771,
1410
+ "<|8.14|>": 50772,
1411
+ "<|8.16|>": 50773,
1412
+ "<|8.18|>": 50774,
1413
+ "<|8.20|>": 50775,
1414
+ "<|8.22|>": 50776,
1415
+ "<|8.24|>": 50777,
1416
+ "<|8.26|>": 50778,
1417
+ "<|8.28|>": 50779,
1418
+ "<|8.30|>": 50780,
1419
+ "<|8.32|>": 50781,
1420
+ "<|8.34|>": 50782,
1421
+ "<|8.36|>": 50783,
1422
+ "<|8.38|>": 50784,
1423
+ "<|8.40|>": 50785,
1424
+ "<|8.42|>": 50786,
1425
+ "<|8.44|>": 50787,
1426
+ "<|8.46|>": 50788,
1427
+ "<|8.48|>": 50789,
1428
+ "<|8.50|>": 50790,
1429
+ "<|8.52|>": 50791,
1430
+ "<|8.54|>": 50792,
1431
+ "<|8.56|>": 50793,
1432
+ "<|8.58|>": 50794,
1433
+ "<|8.60|>": 50795,
1434
+ "<|8.62|>": 50796,
1435
+ "<|8.64|>": 50797,
1436
+ "<|8.66|>": 50798,
1437
+ "<|8.68|>": 50799,
1438
+ "<|8.70|>": 50800,
1439
+ "<|8.72|>": 50801,
1440
+ "<|8.74|>": 50802,
1441
+ "<|8.76|>": 50803,
1442
+ "<|8.78|>": 50804,
1443
+ "<|8.80|>": 50805,
1444
+ "<|8.82|>": 50806,
1445
+ "<|8.84|>": 50807,
1446
+ "<|8.86|>": 50808,
1447
+ "<|8.88|>": 50809,
1448
+ "<|8.90|>": 50810,
1449
+ "<|8.92|>": 50811,
1450
+ "<|8.94|>": 50812,
1451
+ "<|8.96|>": 50813,
1452
+ "<|8.98|>": 50814,
1453
+ "<|9.00|>": 50815,
1454
+ "<|9.02|>": 50816,
1455
+ "<|9.04|>": 50817,
1456
+ "<|9.06|>": 50818,
1457
+ "<|9.08|>": 50819,
1458
+ "<|9.10|>": 50820,
1459
+ "<|9.12|>": 50821,
1460
+ "<|9.14|>": 50822,
1461
+ "<|9.16|>": 50823,
1462
+ "<|9.18|>": 50824,
1463
+ "<|9.20|>": 50825,
1464
+ "<|9.22|>": 50826,
1465
+ "<|9.24|>": 50827,
1466
+ "<|9.26|>": 50828,
1467
+ "<|9.28|>": 50829,
1468
+ "<|9.30|>": 50830,
1469
+ "<|9.32|>": 50831,
1470
+ "<|9.34|>": 50832,
1471
+ "<|9.36|>": 50833,
1472
+ "<|9.38|>": 50834,
1473
+ "<|9.40|>": 50835,
1474
+ "<|9.42|>": 50836,
1475
+ "<|9.44|>": 50837,
1476
+ "<|9.46|>": 50838,
1477
+ "<|9.48|>": 50839,
1478
+ "<|9.50|>": 50840,
1479
+ "<|9.52|>": 50841,
1480
+ "<|9.54|>": 50842,
1481
+ "<|9.56|>": 50843,
1482
+ "<|9.58|>": 50844,
1483
+ "<|9.60|>": 50845,
1484
+ "<|9.62|>": 50846,
1485
+ "<|9.64|>": 50847,
1486
+ "<|9.66|>": 50848,
1487
+ "<|9.68|>": 50849,
1488
+ "<|9.70|>": 50850,
1489
+ "<|9.72|>": 50851,
1490
+ "<|9.74|>": 50852,
1491
+ "<|9.76|>": 50853,
1492
+ "<|9.78|>": 50854,
1493
+ "<|9.80|>": 50855,
1494
+ "<|9.82|>": 50856,
1495
+ "<|9.84|>": 50857,
1496
+ "<|9.86|>": 50858,
1497
+ "<|9.88|>": 50859,
1498
+ "<|9.90|>": 50860,
1499
+ "<|9.92|>": 50861,
1500
+ "<|9.94|>": 50862,
1501
+ "<|9.96|>": 50863,
1502
+ "<|9.98|>": 50864,
1503
+ "<|af|>": 50327,
1504
+ "<|am|>": 50334,
1505
+ "<|ar|>": 50272,
1506
+ "<|as|>": 50350,
1507
+ "<|az|>": 50304,
1508
+ "<|ba|>": 50355,
1509
+ "<|be|>": 50330,
1510
+ "<|bg|>": 50292,
1511
+ "<|bn|>": 50302,
1512
+ "<|bo|>": 50347,
1513
+ "<|br|>": 50309,
1514
+ "<|bs|>": 50315,
1515
+ "<|ca|>": 50270,
1516
+ "<|cs|>": 50283,
1517
+ "<|cy|>": 50297,
1518
+ "<|da|>": 50285,
1519
+ "<|de|>": 50261,
1520
+ "<|el|>": 50281,
1521
+ "<|endoftext|>": 50257,
1522
+ "<|en|>": 50259,
1523
+ "<|es|>": 50262,
1524
+ "<|et|>": 50307,
1525
+ "<|eu|>": 50310,
1526
+ "<|fa|>": 50300,
1527
+ "<|fi|>": 50277,
1528
+ "<|fo|>": 50338,
1529
+ "<|fr|>": 50265,
1530
+ "<|gl|>": 50319,
1531
+ "<|gu|>": 50333,
1532
+ "<|haw|>": 50352,
1533
+ "<|ha|>": 50354,
1534
+ "<|he|>": 50279,
1535
+ "<|hi|>": 50276,
1536
+ "<|hr|>": 50291,
1537
+ "<|ht|>": 50339,
1538
+ "<|hu|>": 50286,
1539
+ "<|hy|>": 50312,
1540
+ "<|id|>": 50275,
1541
+ "<|is|>": 50311,
1542
+ "<|it|>": 50274,
1543
+ "<|ja|>": 50266,
1544
+ "<|jw|>": 50356,
1545
+ "<|ka|>": 50329,
1546
+ "<|kk|>": 50316,
1547
+ "<|km|>": 50323,
1548
+ "<|kn|>": 50306,
1549
+ "<|ko|>": 50264,
1550
+ "<|la|>": 50294,
1551
+ "<|lb|>": 50345,
1552
+ "<|ln|>": 50353,
1553
+ "<|lo|>": 50336,
1554
+ "<|lt|>": 50293,
1555
+ "<|lv|>": 50301,
1556
+ "<|mg|>": 50349,
1557
+ "<|mi|>": 50295,
1558
+ "<|mk|>": 50308,
1559
+ "<|ml|>": 50296,
1560
+ "<|mn|>": 50314,
1561
+ "<|mr|>": 50320,
1562
+ "<|ms|>": 50282,
1563
+ "<|mt|>": 50343,
1564
+ "<|my|>": 50346,
1565
+ "<|ne|>": 50313,
1566
+ "<|nl|>": 50271,
1567
+ "<|nn|>": 50342,
1568
+ "<|nospeech|>": 50363,
1569
+ "<|notimestamps|>": 50364,
1570
+ "<|no|>": 50288,
1571
+ "<|oc|>": 50328,
1572
+ "<|pa|>": 50321,
1573
+ "<|pl|>": 50269,
1574
+ "<|ps|>": 50340,
1575
+ "<|pt|>": 50267,
1576
+ "<|ro|>": 50284,
1577
+ "<|ru|>": 50263,
1578
+ "<|sa|>": 50344,
1579
+ "<|sd|>": 50332,
1580
+ "<|si|>": 50322,
1581
+ "<|sk|>": 50298,
1582
+ "<|sl|>": 50305,
1583
+ "<|sn|>": 50324,
1584
+ "<|so|>": 50326,
1585
+ "<|sq|>": 50317,
1586
+ "<|sr|>": 50303,
1587
+ "<|startoflm|>": 50361,
1588
+ "<|startofprev|>": 50362,
1589
+ "<|startoftranscript|>": 50258,
1590
+ "<|su|>": 50357,
1591
+ "<|sv|>": 50273,
1592
+ "<|sw|>": 50318,
1593
+ "<|ta|>": 50287,
1594
+ "<|te|>": 50299,
1595
+ "<|tg|>": 50331,
1596
+ "<|th|>": 50289,
1597
+ "<|tk|>": 50341,
1598
+ "<|tl|>": 50348,
1599
+ "<|transcribe|>": 50360,
1600
+ "<|translate|>": 50359,
1601
+ "<|tr|>": 50268,
1602
+ "<|tt|>": 50351,
1603
+ "<|uk|>": 50280,
1604
+ "<|ur|>": 50290,
1605
+ "<|uz|>": 50337,
1606
+ "<|vi|>": 50278,
1607
+ "<|yi|>": 50335,
1608
+ "<|yo|>": 50325,
1609
+ "<|yue|>": 50358,
1610
+ "<|zh|>": 50260
1611
+ }
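The added_tokens.json above maps Whisper's timestamp tokens (<|0.00|> through <|30.00|> in 0.02 s steps, ids 50365–51865) together with the language and task control tokens into the vocabulary. A minimal sketch, using only the numbers from this file (the helper names are illustrative and not part of the commit), of converting between a timestamp token id and seconds:

# Illustrative helpers, not part of this commit; constants come from added_tokens.json above.
TIMESTAMP_BEGIN = 50365   # id of "<|0.00|>"
TIMESTAMP_END = 51865     # id of "<|30.00|>"

def timestamp_token_to_seconds(token_id: int) -> float:
    """Recover the time (in seconds) encoded by a Whisper timestamp token id."""
    if not TIMESTAMP_BEGIN <= token_id <= TIMESTAMP_END:
        raise ValueError(f"{token_id} is not a timestamp token in this vocabulary")
    return round((token_id - TIMESTAMP_BEGIN) * 0.02, 2)

def seconds_to_timestamp_token(seconds: float) -> str:
    """Format a time as the corresponding <|..|> timestamp token string."""
    return f"<|{seconds:.2f}|>"

assert timestamp_token_to_seconds(50463) == 1.96   # matches "<|1.96|>": 50463 above
assert seconds_to_timestamp_token(10.0) == "<|10.00|>"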
nb-distil-large-init/config.json ADDED
@@ -0,0 +1,288 @@
1
+ {
2
+ "_name_or_path": "./",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "gelu",
5
+ "alignment_heads": [
6
+ [
7
+ 7,
8
+ 0
9
+ ],
10
+ [
11
+ 10,
12
+ 17
13
+ ],
14
+ [
15
+ 12,
16
+ 18
17
+ ],
18
+ [
19
+ 13,
20
+ 12
21
+ ],
22
+ [
23
+ 16,
24
+ 1
25
+ ],
26
+ [
27
+ 17,
28
+ 14
29
+ ],
30
+ [
31
+ 19,
32
+ 11
33
+ ],
34
+ [
35
+ 21,
36
+ 4
37
+ ],
38
+ [
39
+ 24,
40
+ 1
41
+ ],
42
+ [
43
+ 25,
44
+ 6
45
+ ]
46
+ ],
47
+ "apply_spec_augment": false,
48
+ "architectures": [
49
+ "WhisperForConditionalGeneration"
50
+ ],
51
+ "attention_dropout": 0,
52
+ "begin_suppress_tokens": [
53
+ 220,
54
+ 50257
55
+ ],
56
+ "bos_token_id": 50257,
57
+ "classifier_proj_size": 256,
58
+ "d_model": 1280,
59
+ "decoder_attention_heads": 20,
60
+ "decoder_ffn_dim": 5120,
61
+ "decoder_layerdrop": 0,
62
+ "decoder_layers": 2,
63
+ "decoder_start_token_id": 50258,
64
+ "dropout": 0,
65
+ "encoder_attention_heads": 20,
66
+ "encoder_ffn_dim": 5120,
67
+ "encoder_layerdrop": 0,
68
+ "encoder_layers": 32,
69
+ "eos_token_id": 50257,
70
+ "init_std": 0.02,
71
+ "is_encoder_decoder": true,
72
+ "lang_ids": [
73
+ 50259,
74
+ 50260,
75
+ 50261,
76
+ 50262,
77
+ 50263,
78
+ 50264,
79
+ 50265,
80
+ 50266,
81
+ 50267,
82
+ 50268,
83
+ 50269,
84
+ 50270,
85
+ 50271,
86
+ 50272,
87
+ 50273,
88
+ 50274,
89
+ 50275,
90
+ 50276,
91
+ 50277,
92
+ 50278,
93
+ 50279,
94
+ 50280,
95
+ 50281,
96
+ 50282,
97
+ 50283,
98
+ 50284,
99
+ 50285,
100
+ 50286,
101
+ 50287,
102
+ 50288,
103
+ 50289,
104
+ 50290,
105
+ 50291,
106
+ 50292,
107
+ 50293,
108
+ 50294,
109
+ 50295,
110
+ 50296,
111
+ 50297,
112
+ 50298,
113
+ 50299,
114
+ 50300,
115
+ 50301,
116
+ 50302,
117
+ 50303,
118
+ 50304,
119
+ 50305,
120
+ 50306,
121
+ 50307,
122
+ 50308,
123
+ 50309,
124
+ 50310,
125
+ 50311,
126
+ 50312,
127
+ 50313,
128
+ 50314,
129
+ 50315,
130
+ 50316,
131
+ 50317,
132
+ 50318,
133
+ 50319,
134
+ 50320,
135
+ 50321,
136
+ 50322,
137
+ 50323,
138
+ 50324,
139
+ 50325,
140
+ 50326,
141
+ 50327,
142
+ 50328,
143
+ 50329,
144
+ 50330,
145
+ 50331,
146
+ 50332,
147
+ 50333,
148
+ 50334,
149
+ 50335,
150
+ 50336,
151
+ 50337,
152
+ 50338,
153
+ 50339,
154
+ 50340,
155
+ 50341,
156
+ 50342,
157
+ 50343,
158
+ 50344,
159
+ 50345,
160
+ 50346,
161
+ 50347,
162
+ 50348,
163
+ 50349,
164
+ 50350,
165
+ 50351,
166
+ 50352,
167
+ 50353,
168
+ 50354,
169
+ 50355,
170
+ 50356,
171
+ 50357,
172
+ 50358
173
+ ],
174
+ "mask_feature_length": 10,
175
+ "mask_feature_min_masks": 0,
176
+ "mask_feature_prob": 0,
177
+ "mask_time_length": 10,
178
+ "mask_time_min_masks": 2,
179
+ "mask_time_prob": 0.05,
180
+ "max_length": 448,
181
+ "max_source_positions": 1500,
182
+ "max_target_positions": 448,
183
+ "median_filter_width": 7,
184
+ "model_type": "whisper",
185
+ "num_hidden_layers": 32,
186
+ "num_mel_bins": 128,
187
+ "pad_token_id": 50256,
188
+ "scale_embedding": false,
189
+ "suppress_ids": [
190
+ 1,
191
+ 2,
192
+ 7,
193
+ 8,
194
+ 9,
195
+ 10,
196
+ 14,
197
+ 25,
198
+ 26,
199
+ 27,
200
+ 28,
201
+ 29,
202
+ 31,
203
+ 58,
204
+ 59,
205
+ 60,
206
+ 61,
207
+ 62,
208
+ 63,
209
+ 90,
210
+ 91,
211
+ 92,
212
+ 93,
213
+ 359,
214
+ 503,
215
+ 522,
216
+ 542,
217
+ 873,
218
+ 893,
219
+ 902,
220
+ 918,
221
+ 922,
222
+ 931,
223
+ 1350,
224
+ 1853,
225
+ 1982,
226
+ 2460,
227
+ 2627,
228
+ 3246,
229
+ 3253,
230
+ 3268,
231
+ 3536,
232
+ 3846,
233
+ 3961,
234
+ 4183,
235
+ 4667,
236
+ 6585,
237
+ 6647,
238
+ 7273,
239
+ 9061,
240
+ 9383,
241
+ 10428,
242
+ 10929,
243
+ 11938,
244
+ 12033,
245
+ 12331,
246
+ 12562,
247
+ 13793,
248
+ 14157,
249
+ 14635,
250
+ 15265,
251
+ 15618,
252
+ 16553,
253
+ 16604,
254
+ 18362,
255
+ 18956,
256
+ 20075,
257
+ 21675,
258
+ 22520,
259
+ 26130,
260
+ 26161,
261
+ 26435,
262
+ 28279,
263
+ 29464,
264
+ 31650,
265
+ 32302,
266
+ 32470,
267
+ 36865,
268
+ 42863,
269
+ 47425,
270
+ 49870,
271
+ 50254,
272
+ 50258,
273
+ 50359,
274
+ 50360,
275
+ 50361,
276
+ 50362,
277
+ 50363
278
+ ],
279
+ "suppress_ids_begin": [
280
+ 220,
281
+ 50257
282
+ ],
283
+ "torch_dtype": "float32",
284
+ "transformers_version": "4.46.1",
285
+ "use_cache": true,
286
+ "use_weighted_layer_sum": false,
287
+ "vocab_size": 51866
288
+ }
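The config above describes the distillation student: a large-v3-sized encoder (32 layers, d_model 1280, 128 mel bins) paired with only 2 decoder layers. A minimal sketch of inspecting that architecture, assuming a local checkout of this repository (the "./nb-distil-large-init" path is a placeholder, not something defined by the commit):

from transformers import WhisperConfig, WhisperForConditionalGeneration

# Placeholder path for wherever the repository is checked out.
config = WhisperConfig.from_pretrained("./nb-distil-large-init")
print(config.encoder_layers, config.decoder_layers, config.d_model)  # 32 2 1280

# Instantiating from the config alone gives a randomly initialised student
# with the same shapes as the checkpoint stored in this folder.
student = WhisperForConditionalGeneration(config)
print(f"{sum(p.numel() for p in student.parameters()) / 1e6:.0f}M parameters")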
nb-distil-large-init/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60f608eb7887b643bfb0d6b11d3ad8564c648c296a90c1e558aa61075b1f2839
3
+ size 1512831199
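The msgpack entry above is a Git LFS pointer to roughly 1.5 GB of Flax weights. A minimal sketch, assuming the LFS objects have been pulled and again using a placeholder local path, of loading them; the training script below uses its own FlaxWhisperForConditionalGeneration from distil_whisper, but a standard flax_model.msgpack should also load with the stock Transformers class:

from transformers import FlaxWhisperForConditionalGeneration

# Placeholder path; requires `git lfs pull` so flax_model.msgpack contains the real weights.
model = FlaxWhisperForConditionalGeneration.from_pretrained("./nb-distil-large-init")
print(model.config.decoder_layers)  # 2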
nb-distil-large-init/generation_config.json ADDED
@@ -0,0 +1,270 @@
1
+ {
2
+ "alignment_heads": [
3
+ [
4
+ 7,
5
+ 0
6
+ ],
7
+ [
8
+ 10,
9
+ 17
10
+ ],
11
+ [
12
+ 12,
13
+ 18
14
+ ],
15
+ [
16
+ 13,
17
+ 12
18
+ ],
19
+ [
20
+ 16,
21
+ 1
22
+ ],
23
+ [
24
+ 17,
25
+ 14
26
+ ],
27
+ [
28
+ 19,
29
+ 11
30
+ ],
31
+ [
32
+ 21,
33
+ 4
34
+ ],
35
+ [
36
+ 24,
37
+ 1
38
+ ],
39
+ [
40
+ 25,
41
+ 6
42
+ ]
43
+ ],
44
+ "begin_suppress_tokens": [
45
+ 220,
46
+ 50257
47
+ ],
48
+ "bos_token_id": 50257,
49
+ "decoder_start_token_id": 50258,
50
+ "eos_token_id": 50257,
51
+ "forced_decoder_ids": [
52
+ [
53
+ 1,
54
+ 50288
55
+ ],
56
+ [
57
+ 2,
58
+ 50360
59
+ ],
60
+ [
61
+ 3,
62
+ 50364
63
+ ]
64
+ ],
65
+ "is_multilingual": true,
66
+ "lang_to_id": {
67
+ "<|af|>": 50327,
68
+ "<|am|>": 50334,
69
+ "<|ar|>": 50272,
70
+ "<|as|>": 50350,
71
+ "<|az|>": 50304,
72
+ "<|ba|>": 50355,
73
+ "<|be|>": 50330,
74
+ "<|bg|>": 50292,
75
+ "<|bn|>": 50302,
76
+ "<|bo|>": 50347,
77
+ "<|br|>": 50309,
78
+ "<|bs|>": 50315,
79
+ "<|ca|>": 50270,
80
+ "<|cs|>": 50283,
81
+ "<|cy|>": 50297,
82
+ "<|da|>": 50285,
83
+ "<|de|>": 50261,
84
+ "<|el|>": 50281,
85
+ "<|en|>": 50259,
86
+ "<|es|>": 50262,
87
+ "<|et|>": 50307,
88
+ "<|eu|>": 50310,
89
+ "<|fa|>": 50300,
90
+ "<|fi|>": 50277,
91
+ "<|fo|>": 50338,
92
+ "<|fr|>": 50265,
93
+ "<|gl|>": 50319,
94
+ "<|gu|>": 50333,
95
+ "<|haw|>": 50352,
96
+ "<|ha|>": 50354,
97
+ "<|he|>": 50279,
98
+ "<|hi|>": 50276,
99
+ "<|hr|>": 50291,
100
+ "<|ht|>": 50339,
101
+ "<|hu|>": 50286,
102
+ "<|hy|>": 50312,
103
+ "<|id|>": 50275,
104
+ "<|is|>": 50311,
105
+ "<|it|>": 50274,
106
+ "<|ja|>": 50266,
107
+ "<|jw|>": 50356,
108
+ "<|ka|>": 50329,
109
+ "<|kk|>": 50316,
110
+ "<|km|>": 50323,
111
+ "<|kn|>": 50306,
112
+ "<|ko|>": 50264,
113
+ "<|la|>": 50294,
114
+ "<|lb|>": 50345,
115
+ "<|ln|>": 50353,
116
+ "<|lo|>": 50336,
117
+ "<|lt|>": 50293,
118
+ "<|lv|>": 50301,
119
+ "<|mg|>": 50349,
120
+ "<|mi|>": 50295,
121
+ "<|mk|>": 50308,
122
+ "<|ml|>": 50296,
123
+ "<|mn|>": 50314,
124
+ "<|mr|>": 50320,
125
+ "<|ms|>": 50282,
126
+ "<|mt|>": 50343,
127
+ "<|my|>": 50346,
128
+ "<|ne|>": 50313,
129
+ "<|nl|>": 50271,
130
+ "<|nn|>": 50342,
131
+ "<|no|>": 50288,
132
+ "<|oc|>": 50328,
133
+ "<|pa|>": 50321,
134
+ "<|pl|>": 50269,
135
+ "<|ps|>": 50340,
136
+ "<|pt|>": 50267,
137
+ "<|ro|>": 50284,
138
+ "<|ru|>": 50263,
139
+ "<|sa|>": 50344,
140
+ "<|sd|>": 50332,
141
+ "<|si|>": 50322,
142
+ "<|sk|>": 50298,
143
+ "<|sl|>": 50305,
144
+ "<|sn|>": 50324,
145
+ "<|so|>": 50326,
146
+ "<|sq|>": 50317,
147
+ "<|sr|>": 50303,
148
+ "<|su|>": 50357,
149
+ "<|sv|>": 50273,
150
+ "<|sw|>": 50318,
151
+ "<|ta|>": 50287,
152
+ "<|te|>": 50299,
153
+ "<|tg|>": 50331,
154
+ "<|th|>": 50289,
155
+ "<|tk|>": 50341,
156
+ "<|tl|>": 50348,
157
+ "<|tr|>": 50268,
158
+ "<|tt|>": 50351,
159
+ "<|uk|>": 50280,
160
+ "<|ur|>": 50290,
161
+ "<|uz|>": 50337,
162
+ "<|vi|>": 50278,
163
+ "<|yi|>": 50335,
164
+ "<|yo|>": 50325,
165
+ "<|yue|>": 50358,
166
+ "<|zh|>": 50260
167
+ },
168
+ "language": "<|no|>",
169
+ "max_initial_timestamp_index": 1,
170
+ "max_length": 448,
171
+ "no_timestamps_token_id": 50364,
172
+ "pad_token_id": 50257,
173
+ "return_timestamps": false,
174
+ "suppress_tokens": [
175
+ 1,
176
+ 2,
177
+ 7,
178
+ 8,
179
+ 9,
180
+ 10,
181
+ 14,
182
+ 25,
183
+ 26,
184
+ 27,
185
+ 28,
186
+ 29,
187
+ 31,
188
+ 58,
189
+ 59,
190
+ 60,
191
+ 61,
192
+ 62,
193
+ 63,
194
+ 90,
195
+ 91,
196
+ 92,
197
+ 93,
198
+ 359,
199
+ 503,
200
+ 522,
201
+ 542,
202
+ 873,
203
+ 893,
204
+ 902,
205
+ 918,
206
+ 922,
207
+ 931,
208
+ 1350,
209
+ 1853,
210
+ 1982,
211
+ 2460,
212
+ 2627,
213
+ 3246,
214
+ 3253,
215
+ 3268,
216
+ 3536,
217
+ 3846,
218
+ 3961,
219
+ 4183,
220
+ 4667,
221
+ 6585,
222
+ 6647,
223
+ 7273,
224
+ 9061,
225
+ 9383,
226
+ 10428,
227
+ 10929,
228
+ 11938,
229
+ 12033,
230
+ 12331,
231
+ 12562,
232
+ 13793,
233
+ 14157,
234
+ 14635,
235
+ 15265,
236
+ 15618,
237
+ 16553,
238
+ 16604,
239
+ 18362,
240
+ 18956,
241
+ 20075,
242
+ 21675,
243
+ 22520,
244
+ 26130,
245
+ 26161,
246
+ 26435,
247
+ 28279,
248
+ 29464,
249
+ 31650,
250
+ 32302,
251
+ 32470,
252
+ 36865,
253
+ 42863,
254
+ 47425,
255
+ 49870,
256
+ 50254,
257
+ 50258,
258
+ 50359,
259
+ 50360,
260
+ 50361,
261
+ 50362,
262
+ 50363
263
+ ],
264
+ "task": "transcribe",
265
+ "task_to_id": {
266
+ "transcribe": 50360,
267
+ "translate": 50359
268
+ },
269
+ "transformers_version": "4.46.1"
270
+ }
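In the generation config above, forced_decoder_ids [[1, 50288], [2, 50360], [3, 50364]] pin decoding to Norwegian (<|no|>), the transcribe task, and no timestamps. A minimal sketch (placeholder path again) showing that the same prompt ids can be rebuilt from the processor:

from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("./nb-distil-large-init")
prompt_ids = processor.get_decoder_prompt_ids(language="no", task="transcribe", no_timestamps=True)
print(prompt_ids)  # expected: [(1, 50288), (2, 50360), (3, 50364)]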
nb-distil-large-init/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
nb-distil-large-init/preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 128,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }
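The preprocessor settings above (128 mel bins, hop length 160, n_fft 400, 30 s chunks at 16 kHz, i.e. 480 000 samples and 3000 frames) define the log-mel features the encoder expects. A minimal sketch of running a feature extractor with exactly these values on dummy audio (the audio array is a placeholder):

import numpy as np
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor(
    feature_size=128, sampling_rate=16000, hop_length=160, chunk_length=30, n_fft=400
)
audio = np.zeros(16000 * 5, dtype=np.float32)        # 5 s of silence, padded to 30 s
inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="np")
print(inputs["input_features"].shape)                 # (1, 128, 3000)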
nb-distil-large-init/special_tokens_map.json ADDED
@@ -0,0 +1,139 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|startoftranscript|>",
4
+ "<|en|>",
5
+ "<|zh|>",
6
+ "<|de|>",
7
+ "<|es|>",
8
+ "<|ru|>",
9
+ "<|ko|>",
10
+ "<|fr|>",
11
+ "<|ja|>",
12
+ "<|pt|>",
13
+ "<|tr|>",
14
+ "<|pl|>",
15
+ "<|ca|>",
16
+ "<|nl|>",
17
+ "<|ar|>",
18
+ "<|sv|>",
19
+ "<|it|>",
20
+ "<|id|>",
21
+ "<|hi|>",
22
+ "<|fi|>",
23
+ "<|vi|>",
24
+ "<|he|>",
25
+ "<|uk|>",
26
+ "<|el|>",
27
+ "<|ms|>",
28
+ "<|cs|>",
29
+ "<|ro|>",
30
+ "<|da|>",
31
+ "<|hu|>",
32
+ "<|ta|>",
33
+ "<|no|>",
34
+ "<|th|>",
35
+ "<|ur|>",
36
+ "<|hr|>",
37
+ "<|bg|>",
38
+ "<|lt|>",
39
+ "<|la|>",
40
+ "<|mi|>",
41
+ "<|ml|>",
42
+ "<|cy|>",
43
+ "<|sk|>",
44
+ "<|te|>",
45
+ "<|fa|>",
46
+ "<|lv|>",
47
+ "<|bn|>",
48
+ "<|sr|>",
49
+ "<|az|>",
50
+ "<|sl|>",
51
+ "<|kn|>",
52
+ "<|et|>",
53
+ "<|mk|>",
54
+ "<|br|>",
55
+ "<|eu|>",
56
+ "<|is|>",
57
+ "<|hy|>",
58
+ "<|ne|>",
59
+ "<|mn|>",
60
+ "<|bs|>",
61
+ "<|kk|>",
62
+ "<|sq|>",
63
+ "<|sw|>",
64
+ "<|gl|>",
65
+ "<|mr|>",
66
+ "<|pa|>",
67
+ "<|si|>",
68
+ "<|km|>",
69
+ "<|sn|>",
70
+ "<|yo|>",
71
+ "<|so|>",
72
+ "<|af|>",
73
+ "<|oc|>",
74
+ "<|ka|>",
75
+ "<|be|>",
76
+ "<|tg|>",
77
+ "<|sd|>",
78
+ "<|gu|>",
79
+ "<|am|>",
80
+ "<|yi|>",
81
+ "<|lo|>",
82
+ "<|uz|>",
83
+ "<|fo|>",
84
+ "<|ht|>",
85
+ "<|ps|>",
86
+ "<|tk|>",
87
+ "<|nn|>",
88
+ "<|mt|>",
89
+ "<|sa|>",
90
+ "<|lb|>",
91
+ "<|my|>",
92
+ "<|bo|>",
93
+ "<|tl|>",
94
+ "<|mg|>",
95
+ "<|as|>",
96
+ "<|tt|>",
97
+ "<|haw|>",
98
+ "<|ln|>",
99
+ "<|ha|>",
100
+ "<|ba|>",
101
+ "<|jw|>",
102
+ "<|su|>",
103
+ "<|yue|>",
104
+ "<|translate|>",
105
+ "<|transcribe|>",
106
+ "<|startoflm|>",
107
+ "<|startofprev|>",
108
+ "<|nospeech|>",
109
+ "<|notimestamps|>"
110
+ ],
111
+ "bos_token": {
112
+ "content": "<|endoftext|>",
113
+ "lstrip": false,
114
+ "normalized": false,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "eos_token": {
119
+ "content": "<|endoftext|>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ },
125
+ "pad_token": {
126
+ "content": "<|endoftext|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false
131
+ },
132
+ "unk_token": {
133
+ "content": "<|endoftext|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false
138
+ }
139
+ }
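special_tokens_map.json above registers every language/task control token as an additional special token and reuses <|endoftext|> for bos, eos, pad and unk. A minimal sketch (placeholder path) of checking this on the loaded tokenizer:

from transformers import WhisperTokenizerFast

tokenizer = WhisperTokenizerFast.from_pretrained("./nb-distil-large-init")
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)   # all <|endoftext|>
print(tokenizer.convert_tokens_to_ids("<|startoftranscript|>"))        # 50258, per added_tokens.json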
nb-distil-large-init/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
nb-distil-large-init/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "FlaxWhisperFeatureExtractor",
4
+ "feature_size": 128,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }
run_distillation.py ADDED
@@ -0,0 +1,2156 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Training the Whisper model for sequence to sequence speech recognition via teacher-student distillation.
18
+ """
19
+ # You can also adapt this script for your own distillation tasks. Pointers for this are left as comments.
20
+
21
+ import logging
22
+ import os
23
+ import re
24
+ import shutil
25
+ import string
26
+ import sys
27
+ import time
28
+ from dataclasses import dataclass, field
29
+ from functools import partial
30
+ from pathlib import Path
31
+ from typing import Any, Callable, Dict, List, Optional, Union
32
+
33
+ import datasets
34
+ import evaluate
35
+ import flax
36
+ import jax
37
+ import jax.numpy as jnp
38
+ import numpy as np
39
+ import optax
40
+ import torch
41
+ import transformers
42
+ from datasets import (
43
+ DatasetDict,
44
+ IterableDataset,
45
+ IterableDatasetDict,
46
+ concatenate_datasets,
47
+ interleave_datasets,
48
+ load_dataset,
49
+ )
50
+ from flax import jax_utils, traverse_util
51
+ from flax.jax_utils import pad_shard_unpad, unreplicate
52
+ from flax.serialization import from_bytes, to_bytes
53
+ from flax.training import train_state
54
+ from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
55
+ from huggingface_hub import Repository, create_repo
56
+ from jax.experimental.compilation_cache import compilation_cache as cc
57
+ from optax._src import linear_algebra
58
+ from torch.utils.data import DataLoader
59
+ from torchdata.datapipes.iter import IterableWrapper
60
+ from tqdm import tqdm
61
+ from transformers import (
62
+ AddedToken,
63
+ HfArgumentParser,
64
+ Seq2SeqTrainingArguments,
65
+ WhisperConfig,
66
+ WhisperFeatureExtractor,
67
+ WhisperProcessor,
68
+ WhisperTokenizerFast,
69
+ is_tensorboard_available,
70
+ is_wandb_available,
71
+ set_seed,
72
+ )
73
+ from transformers.file_utils import get_full_repo_name
74
+ from transformers.modeling_flax_outputs import FlaxBaseModelOutput
75
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer, EnglishTextNormalizer
76
+ from transformers.utils import check_min_version, send_example_telemetry
77
+ from transformers.utils.versions import require_version
78
+
79
+ from distil_whisper import FlaxWhisperForConditionalGeneration
80
+
81
+
82
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
83
+ check_min_version("4.27.0.dev0")
84
+
85
+ require_version(
86
+ "datasets>=1.18.0",
87
+ "To fix: pip install -r examples/flax/speech-recogintion/requirements.txt",
88
+ )
89
+
90
+ logger = logging.getLogger(__name__)
91
+
92
+
93
+ @flax.struct.dataclass
94
+ class ModelArguments:
95
+ """
96
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
97
+ """
98
+
99
+ model_name_or_path: str = field(
100
+ metadata={"help": ("Path to pretrained student model or model identifier from huggingface.co/models")}
101
+ )
102
+ teacher_model_name_or_path: str = field(
103
+ metadata={"help": ("Path to pretrained teacher model or model identifier from huggingface.co/models")}
104
+ )
105
+ config_name: Optional[str] = field(
106
+ default=None,
107
+ metadata={"help": "Pretrained config name or path if not the same as model_name"},
108
+ )
109
+ tokenizer_name: Optional[str] = field(
110
+ default=None,
111
+ metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
112
+ )
113
+ feature_extractor_name: Optional[str] = field(
114
+ default=None,
115
+ metadata={"help": "feature extractor name or path if not the same as model_name"},
116
+ )
117
+ cache_dir: Optional[str] = field(
118
+ default=None,
119
+ metadata={"help": ("Where to store the pretrained models downloaded from huggingface.co")},
120
+ )
121
+ use_fast_tokenizer: bool = field(
122
+ default=True,
123
+ metadata={"help": ("Whether to use one of the fast tokenizer (backed by the tokenizers library) or not.")},
124
+ )
125
+ model_revision: str = field(
126
+ default="main",
127
+ metadata={"help": ("The specific model version to use (can be a branch name, tag name or commit id).")},
128
+ )
129
+ subfolder: str = field(
130
+ default="",
131
+ metadata={
132
+ "help": "In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can"
133
+ "specify the folder name here."
134
+ },
135
+ )
136
+ use_auth_token: bool = field(
137
+ default=False,
138
+ metadata={
139
+ "help": (
140
+ "Will use the token generated when running `transformers-cli login`"
141
+ " (necessary to use this script with private models)."
142
+ )
143
+ },
144
+ )
145
+ dtype: Optional[str] = field(
146
+ default="float32",
147
+ metadata={
148
+ "help": (
149
+ "Floating-point format in which the model weights should be initialized"
150
+ " and trained. Choose one of `[float32, float16, bfloat16]`."
151
+ )
152
+ },
153
+ )
154
+ load_with_scan_weights: bool = field(
155
+ default=False,
156
+ metadata={
157
+ "help": "Whether the pre-trained checkpoint has its weights stored in scan format. Set to True for scanned "
158
+ "weights, defaults to False for non-scan (unrolled) weights."
159
+ },
160
+ )
161
+ activation_dropout: float = field(
162
+ default=0.0,
163
+ metadata={"help": "The dropout ratio for activations inside the fully connected layer."},
164
+ )
165
+ attention_dropout: float = field(
166
+ default=0.0,
167
+ metadata={"help": "The dropout ratio for the attention probabilities."},
168
+ )
169
+ dropout: float = field(
170
+ default=0.0,
171
+ metadata={
172
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
173
+ },
174
+ )
175
+
176
+
177
+ @flax.struct.dataclass
178
+ class DataTrainingArguments:
179
+ """
180
+ Arguments pertaining to what data we are going to input our model for training and eval.
181
+ """
182
+
183
+ train_dataset_name: str = field(
184
+ default=None,
185
+ metadata={
186
+ "help": "The name of the training dataset to use (via the datasets library). Load and combine "
187
+ "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
188
+ " librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
189
+ },
190
+ )
191
+ train_dataset_config_name: Optional[str] = field(
192
+ default=None,
193
+ metadata={
194
+ "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
195
+ "multiple datasets by separating dataset configs by a '+' symbol."
196
+ },
197
+ )
198
+ train_dataset_samples: str = field(
199
+ default=None,
200
+ metadata={
201
+ "help": "Number of samples in the training data. Load and combine "
202
+ "multiple datasets by separating dataset samples by a '+' symbol."
203
+ },
204
+ )
205
+ eval_dataset_name: str = field(
206
+ default=None,
207
+ metadata={
208
+ "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
209
+ },
210
+ )
211
+ eval_dataset_config_name: Optional[str] = field(
212
+ default=None,
213
+ metadata={
214
+ "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
215
+ },
216
+ )
217
+ dataset_cache_dir: Optional[str] = field(
218
+ default=None,
219
+ metadata={"help": "Path to cache directory for saving and loading datasets"},
220
+ )
221
+ overwrite_cache: bool = field(
222
+ default=False,
223
+ metadata={"help": "Overwrite the cached training and evaluation sets"},
224
+ )
225
+ preprocessing_num_workers: Optional[int] = field(
226
+ default=None,
227
+ metadata={"help": "The number of processes to use for the preprocessing."},
228
+ )
229
+ max_train_samples: Optional[int] = field(
230
+ default=None,
231
+ metadata={
232
+ "help": (
233
+ "For debugging purposes or quicker training, truncate the number of"
234
+ " training examples to this value if set."
235
+ )
236
+ },
237
+ )
238
+ max_eval_samples: Optional[int] = field(
239
+ default=None,
240
+ metadata={
241
+ "help": (
242
+ "For debugging purposes or quicker training, truncate the number of"
243
+ " evaluation examples to this value if set."
244
+ )
245
+ },
246
+ )
247
+ audio_column_name: str = field(
248
+ default="audio",
249
+ metadata={"help": ("The name of the dataset column containing the audio data. Defaults to 'audio'")},
250
+ )
251
+ train_text_column_name: str = field(
252
+ default="whisper_transcript",
253
+ metadata={
254
+ "help": (
255
+ "The name of the dataset column containing the text data. Defaults to"
256
+ " 'whisper_transcript'which is the pseudo-labelled Whisper"
257
+ " transcription data."
258
+ )
259
+ },
260
+ )
261
+ eval_text_column_name: str = field(
262
+ default="text",
263
+ metadata={
264
+ "help": (
265
+ "The name of the dataset column containing the text data. Defaults to"
266
+ " 'text', which is the original text data"
267
+ )
268
+ },
269
+ )
270
+ max_duration_in_seconds: float = field(
271
+ default=30.0,
272
+ metadata={"help": ("Filter audio files that are longer than `max_duration_in_seconds` seconds")},
273
+ )
274
+ min_duration_in_seconds: float = field(
275
+ default=0.0,
276
+ metadata={"help": ("Filter audio files that are shorter than `min_duration_in_seconds` seconds")},
277
+ )
278
+ max_label_length: int = field(
279
+ default=128,
280
+ metadata={"help": "Truncate transcriptions that are longer `max_label_length` tokens."},
281
+ )
282
+ pad_target_to_multiple_of: Optional[int] = field(
283
+ default=None,
284
+ metadata={
285
+ "help": (
286
+ "If set will pad the target sequence to a multiple of the provided"
287
+ " value. This is important to avoid triggering recompilations on TPU."
288
+ " If unspecified, will default to padding the targets to max length."
289
+ )
290
+ },
291
+ )
292
+ preprocessing_only: bool = field(
293
+ default=False,
294
+ metadata={
295
+ "help": (
296
+ "Whether to only do data preprocessing and skip training. This is"
297
+ " especially useful when data preprocessing errors out in distributed"
298
+ " training due to timeout. In this case, one should run the"
299
+ " preprocessing in a non-distributed setup with"
300
+ " `preprocessing_only=True` so that the cached datasets can"
301
+ " consequently be loaded in distributed training"
302
+ )
303
+ },
304
+ )
305
+ train_split_name: str = field(
306
+ default="train",
307
+ metadata={
308
+ "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
309
+ },
310
+ )
311
+ eval_split_name: str = field(
312
+ default="validation",
313
+ metadata={
314
+ "help": (
315
+ "The name of the evaluation data set split to use (via the datasets"
316
+ " library). Defaults to 'validation'"
317
+ )
318
+ },
319
+ )
320
+ wandb_project: str = field(
321
+ default="distil-whisper",
322
+ metadata={"help": "The name of the wandb project."},
323
+ )
324
+ wandb_name: str = field(
325
+ default=None,
326
+ metadata={"help": "The name of the wandb run."},
327
+ )
328
+ wandb_job_type: str = field(
329
+ default="distil-whisper",
330
+ metadata={"help": "The name of the wandb job type."},
331
+ )
332
+ wandb_dir: str = field(
333
+ default=None,
334
+ metadata={"help": "The absolute path to save the wandb logs."},
335
+ )
336
+ save_code_to_wandb: bool = field(
337
+ default=False,
338
+ metadata={
339
+ "help": (
340
+ "Whether to save main script to wandb. This is valuable for improving"
341
+ " experiment reproducibility and to diff code across experiments in"
342
+ " the UI."
343
+ )
344
+ },
345
+ )
346
+ streaming: bool = field(
347
+ default=True,
348
+ metadata={"help": "Whether to use Datasets' streaming mode to load and the data."},
349
+ )
350
+ wer_threshold: float = field(
351
+ default=None,
352
+ metadata={
353
+ "help": "Filter training data with Whisper transcriptions that have greater than `wer_threshold` "
354
+ "WER with the normalised transcriptions."
355
+ },
356
+ )
357
+ prefetch_size: int = field(
358
+ default=0,
359
+ metadata={"help": "Number of samples to pre-fetch if using an iterable dataset."},
360
+ )
361
+ timestamp_probability: float = field(
362
+ default=0.5, metadata={"help": "Probability for training on timestamped tokens if the data contains it."}
363
+ )
364
+ return_timestamps: bool = field(
365
+ default=False, metadata={"help": "Whether or not to predict timestamps in the generation step."}
366
+ )
367
+ round_timestamps: bool = field(
368
+ default=False,
369
+ metadata={
370
+ "help": "Whether or not to round the timestamp tokens to the nearest tenth of a second."
371
+ "By default, Whisper predicts timestamps to the nearest hundredth of a second."
372
+ "Reducing the timestamp precision to one tenth of a second simplifies the timestamp"
373
+ "prediction task, at the expense of timestamp granularity."
374
+ },
375
+ )
376
+
377
+
378
+ @dataclass
379
+ class FlaxSeq2SeqTrainingArguments(Seq2SeqTrainingArguments):
380
+ use_scan: Optional[bool] = field(
381
+ default=True,
382
+ metadata={
383
+ "help": (
384
+ "Whether or not to use `scan_with_axes` over the encoder and decoder blocks. Using scan results "
385
+ "in faster compile times and more efficient memory use during training, since all of the layers "
386
+ "in the encoder/decoder are stacked, and we perform a lax.scan over the stacked block to index "
387
+ "each layer. However, it results in slower inference time due to the overhead of stacking the "
388
+ "layers this way. Thus, we **always** default to disabling scan for the inference step."
389
+ )
390
+ },
391
+ )
392
+ freeze_encoder: Optional[bool] = field(
393
+ default=False,
394
+ metadata={
395
+ "help": (
396
+ "Whether to freeze the entire encoder model. Only recommended when the entire encoder has been "
397
+ "copied from the teacher model."
398
+ )
399
+ },
400
+ )
401
+ temperature: Optional[float] = field(
402
+ default=2.0, metadata={"help": "Temperature to anneal the logits when computing the softmax."}
403
+ )
404
+ kl_weight: Optional[float] = field(
405
+ default=1.0,
406
+ metadata={
407
+ "help": (
408
+ "Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
409
+ "computed between the teacher-student hidden states and attentions."
410
+ )
411
+ },
412
+ )
413
+ mse_weight: Optional[float] = field(
414
+ default=0.0,
415
+ metadata={
416
+ "help": (
417
+ "Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
418
+ "computed between the teacher-student hidden states and attentions."
419
+ )
420
+ },
421
+ )
422
+ precision: Optional[str] = field(
423
+ default="half_mixed",
424
+ metadata={
425
+ "help": (
426
+ "Precision with which run training, Can be one of `full`, `half_mixed` or `full_mixed`, the latter two"
427
+ "of which enable *mixed-precision* training. **Note that this only specifies the dtype of the computation "
428
+ "and optimizer state. It does not influence the dtype of model parameters.** An explanation of the three "
429
+ "settings is provided below:"
430
+ " 1. Full precision: forward pass, backward pass and optimiser states all in float32."
431
+ " 2. Half mixed precision: forward pass in bfloat16, backward pass and optimiser states in float32. This "
432
+ " corresponds to setting the dtype argument to bfloat16 when instantiating the model."
433
+ " 3. Full mixed precision: forward pass, backward pass and optimiser states all in bfloat16. The dtype "
434
+ " argument is set to bfloat16 for the forward pass, and the gradients computed with respect to the bfloat16 "
435
+ " parameters in the backward pass (giving bfloat16 gradients). The new optimiser states and parameter "
436
+ " updates are computed in float32 by upcasting the bfloat16 gradients and optimiser states to float32 "
437
+ " prior to the optimiser update step. The optimiser states are returned in float32 (but not saved to "
438
+ " memory) and then downcasted to bfloat16 (saved to memory) for the subsequent train step."
439
+ "For further details, refer to https://github.com/deepmind/optax/discussions/336"
440
+ )
441
+ },
442
+ )
443
+ compilation_cache: Optional[bool] = field(
444
+ default=False,
445
+ metadata={
446
+ "help": (
447
+ "Whether to enable the JAX (experimental) compilation cache. The compilation step is *cached* the "
448
+ "first time it is run. Successive compilation steps for the same function utilise the cache to reduce"
449
+ "the compilation time."
450
+ )
451
+ },
452
+ )
453
+ save_train_state: Optional[bool] = field(
454
+ default=False,
455
+ metadata={
456
+ "help": "Whether or not to save the Flax Train State on each `save_steps` steps. Required if you intend"
457
+ "to resume training from partial training runs. If False, only the model weights will be saved."
458
+ "If True, both the model weights and Flax Train state will be saved."
459
+ },
460
+ )
461
+
462
+
463
+ def shift_tokens_right(label_ids: np.array, decoder_start_token_id: int) -> np.ndarray:
464
+ """
465
+ Shift label ids one token to the right.
466
+ """
467
+ shifted_label_ids = np.zeros_like(label_ids)
468
+ shifted_label_ids[:, 1:] = label_ids[:, :-1]
469
+ shifted_label_ids[:, 0] = decoder_start_token_id
470
+
471
+ return shifted_label_ids
472
+
473
+
474
+ @flax.struct.dataclass
475
+ class FlaxDataCollatorSpeechSeq2SeqWithPadding:
476
+ """
477
+ Data collator that will dynamically pad the inputs received.
478
+ Args:
479
+ processor ([`WhisperProcessor`])
480
+ The processor used for processing the data.
481
+ decoder_start_token_id (:obj: `int`)
482
+ The start-of-sequence token id of the decoder.
483
+ decoder_prev_token_id (:obj: `int`)
484
+ The start-of-prompt token id of the decoder
485
+ input_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
486
+ Select a strategy to pad the returned input sequences (according to the model's padding side and padding index)
487
+ among:
488
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
489
+ sequence is provided).
490
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
491
+ maximum acceptable input length for the model if that argument is not provided.
492
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
493
+ different lengths).
494
+ target_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
495
+ Select a strategy to pad the returned target sequences (according to the model's padding side and padding index).
496
+ See above for details.
497
+ max_target_length (:obj:`int`, `optional`):
498
+ Maximum length of the ``labels`` of the returned list and optionally padding length (see above).
499
+ """
500
+
501
+ processor: Any
502
+ decoder_start_token_id: int
503
+ decoder_prev_token_id: int
504
+ input_padding: Union[bool, str] = "max_length"
505
+ target_padding: Union[bool, str] = "max_length"
506
+ max_target_length: Optional[int] = None
507
+
508
+ def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
509
+ # split inputs and labels since they have to be of different lengths and need
510
+ # different padding methods
511
+ model_input_name = self.processor.model_input_names[0]
512
+
513
+ # dataloader returns a list of features which we convert to a dict
514
+ input_features = {model_input_name: [feature[model_input_name] for feature in features]}
515
+ label_features = {"input_ids": [feature["labels"] for feature in features]}
516
+
517
+ # reformat list to dict and set to pytorch format
518
+ batch = self.processor.feature_extractor.pad(
519
+ input_features,
520
+ padding=self.input_padding,
521
+ return_tensors="np",
522
+ )
523
+
524
+ labels_batch = self.processor.tokenizer.pad(
525
+ label_features,
526
+ max_length=self.max_target_length,
527
+ padding=self.target_padding,
528
+ return_tensors="np",
529
+ )
530
+
531
+ # if bos token is appended in previous tokenization step,
532
+ # cut bos token here as it's appended later anyway
533
+ labels = labels_batch["input_ids"]
534
+ if set(np.unique(labels[:, 0])).issubset({self.decoder_start_token_id, self.decoder_prev_token_id}):
535
+ decoder_input_ids = labels[:, :-1]
536
+ labels = labels[:, 1:]
537
+ labels_batch.attention_mask = labels_batch.attention_mask[:, 1:]
538
+ else:
539
+ decoder_input_ids = shift_tokens_right(labels, self.decoder_start_token_id)
540
+
541
+ # replace padding with -100 to ignore correctly when computing the loss
542
+ labels = np.ma.array(labels, mask=np.not_equal(labels_batch.attention_mask, 1))
543
+ labels = labels.filled(fill_value=-100)
544
+
545
+ # replace initial prompt tokens with -100 to ignore correctly when computing the loss
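+ # np.argmax returns the index of the *first* occurrence of the decoder start (BOS) token per row, so every token before it (the prompt) is masked out of the loss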
546
+ bos_index = np.argmax(labels == self.decoder_start_token_id, axis=1)
547
+ prompt_mask = np.arange(labels.shape[1]) < bos_index[:, None]
548
+ labels = np.where(prompt_mask, -100, labels)
549
+
550
+ batch["labels"] = labels
551
+ batch["decoder_input_ids"] = decoder_input_ids
552
+
553
+ return batch
554
+
555
+
556
+ def get_data_loader(
557
+ seed: int,
558
+ dataset: IterableDataset,
559
+ batch_size: int,
560
+ data_collator: FlaxDataCollatorSpeechSeq2SeqWithPadding,
561
+ shuffle: bool = False,
562
+ drop_last: bool = True,
563
+ dataloader_num_workers: int = 0,
564
+ skip_batches: int = 0,
565
+ pin_memory: bool = True,
566
+ prefetch_size: int = 0,
567
+ ) -> DataLoader:
568
+ """
569
+ Returns batches of size `batch_size` from `dataset`. If `drop_last` is set to `False`, the final batch may be incomplete,
570
+ and range in size from 1 to `batch_size`. Shuffle batches if `shuffle` is `True`.
571
+
572
+ Args:
573
+ seed (int): Numpy seed for generating pseudo random numbers. Used if shuffling the dataset.
574
+ dataset (IterableDataset): streaming dataset from which to load the data.
575
+ batch_size (int): how many samples per batch to load.
576
+ data_collator (FlaxDataCollatorSpeechSeq2SeqWithPadding, optional): merges a list of samples to form a
577
+ mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
578
+ shuffle (bool, optional): set to `True` to have the batches reshuffled.
579
+ drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
580
+ if the dataset size is not divisible by the batch size. If ``False`` and
581
+ the size of dataset is not divisible by the batch size, then the last batch
582
+ will be smaller. (default: ``True``)
583
+ dataloader_num_workers (int, optional): how many subprocesses to use for data
584
+ loading. ``0`` means that the data will be loaded in the main process.
585
+ (default: ``0``)
586
+ skip_batches (int, optional): Efficiently skip the first `skip_batches` batches.
587
+ pin_memory (bool, optional): If ``True``, the data loader will copy Tensors
588
+ into device/CUDA pinned memory before returning them. If your data elements
589
+ are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type,
590
+ note that only Tensors (and standard collections of Tensors) are pinned automatically.
591
+
592
+ """
593
+ if shuffle:
594
+ dataset = dataset.shuffle(seed)
595
+
596
+ if skip_batches > 0:
597
+ dataset = dataset.skip(skip_batches * batch_size)
598
+
599
+ if prefetch_size > 0:
600
+ dataset = IterableWrapper(dataset)
601
+ dataset = dataset.prefetch(prefetch_size)
602
+
603
+ data_loader = DataLoader(
604
+ dataset,
605
+ batch_size=batch_size,
606
+ drop_last=drop_last,
607
+ pin_memory=pin_memory,
608
+ collate_fn=data_collator,
609
+ num_workers=dataloader_num_workers,
610
+ )
611
+
612
+ return data_loader
613
+
614
+
615
+ def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
616
+ ordering_and_checkpoint_path = []
617
+
618
+ glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]
619
+
620
+ for path in glob_checkpoints:
621
+ if use_mtime:
622
+ ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
623
+ else:
624
+ regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
625
+ if regex_match is not None and regex_match.groups() is not None:
626
+ ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
627
+
628
+ checkpoints_sorted = sorted(ordering_and_checkpoint_path)
629
+ checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
630
+ return checkpoints_sorted
631
+
632
+
633
+ def rotate_checkpoints(
634
+ save_total_limit=None, use_mtime=False, output_dir=None, checkpoint_prefix="checkpoint"
635
+ ) -> None:
636
+ if save_total_limit is None or save_total_limit <= 0:
637
+ return
638
+
639
+ # Check if we should delete older checkpoint(s)
640
+ checkpoints_sorted = sorted_checkpoints(
641
+ use_mtime=use_mtime, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
642
+ )
643
+ if len(checkpoints_sorted) <= save_total_limit:
644
+ return
645
+
646
+ number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
647
+ checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
648
+ for checkpoint in checkpoints_to_be_deleted:
649
+ logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
650
+ shutil.rmtree(checkpoint, ignore_errors=True)
651
+
652
+
653
+ def to_fp32(t):
654
+ return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
655
+
656
+
657
+ def to_bf16(t):
658
+ return jax.tree_map(lambda x: x.astype(jnp.bfloat16) if x.dtype == jnp.float32 else x, t)
659
+
660
+
661
+ class TrainState(train_state.TrainState):
662
+ dropout_rng: jnp.ndarray
663
+ max_grad_norm: float
664
+
665
+ def apply_gradients(self, *, grads, to_dtype: Callable, **kwargs):
666
+ """Updates `step`, `params`, `opt_state` and `**kwargs` in return value, clipping the
667
+ gradients by the maximum grad norm.
668
+
669
+ Note that internally this function calls `.tx.update()` followed by a call
670
+ to `optax.apply_updates()` to update `params` and `opt_state`.
671
+
672
+ Args:
673
+ grads: Gradients that have the same pytree structure as `.params`.
674
+ **kwargs: Additional dataclass attributes that should be `.replace()`-ed.
675
+
676
+ Returns:
677
+ An updated instance of `self` with `step` incremented by one, `params`
678
+ and `opt_state` updated by applying `grads`, and additional attributes
679
+ replaced as specified by `kwargs`.
680
+ """
681
+ # clip gradients by global l2 norm
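+ # if the global norm is already below max_grad_norm, g_norm == max_grad_norm and the rescaling below is a no-op; otherwise gradients are scaled down to the max norm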
682
+ casted_max_grad_norm = to_dtype(self.max_grad_norm)
683
+ g_norm = linear_algebra.global_norm(grads)
684
+ g_norm = jnp.maximum(casted_max_grad_norm, g_norm)
685
+ grads = jax.tree_map(lambda t: (t / g_norm) * casted_max_grad_norm, grads)
686
+
687
+ # perform update step in fp32 and subsequently downcast optimizer states if mixed precision training
688
+ # grads and opt_state in bf16 (need to upcast), params in fp32 (leave as is)
689
+ updates, new_opt_state = self.tx.update(to_fp32(grads), to_fp32(self.opt_state), self.params)
690
+
691
+ new_params = optax.apply_updates(self.params, updates)
692
+
693
+ return self.replace(
694
+ step=self.step + 1,
695
+ params=new_params,
696
+ opt_state=to_dtype(new_opt_state),
697
+ **kwargs,
698
+ )
699
+
700
+ @classmethod
701
+ def create(cls, *, apply_fn, params, tx, to_dtype: Callable, **kwargs):
702
+ """Creates a new instance with `step=0` and initialized `opt_state`."""
703
+ # downcast optimizer state to bf16 if mixed-precision training
704
+ opt_state = tx.init(to_dtype(params))
705
+ return cls(
706
+ step=0,
707
+ apply_fn=apply_fn,
708
+ params=params,
709
+ tx=tx,
710
+ opt_state=opt_state,
711
+ **kwargs,
712
+ )
713
+
714
+ def replicate(self):
715
+ return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
716
+
717
+ def unreplicate(self):
718
+ return jax_utils.unreplicate(self)
719
+
720
+ def save_state(self, output_dir, save_total_limit=None, checkpoint_prefix="checkpoint"):
721
+ step = int(jax.device_get(unreplicate(self.step)))
722
+ serialized_state = to_bytes(self.unreplicate())
723
+
724
+ output_file = Path(os.path.join(output_dir, f"{checkpoint_prefix}-{step}", "train_state.msgpack"))
725
+ output_file.parent.mkdir(exist_ok=True, parents=True)
726
+
727
+ with output_file.open("wb") as f:
728
+ f.write(serialized_state)
729
+
730
+ logger.info(f"Flax train state saved in {output_file}")
731
+ rotate_checkpoints(
732
+ save_total_limit=save_total_limit, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
733
+ )
734
+
735
+
736
+ def save_hf_weights(
737
+ student_state: TrainState,
738
+ student_model: FlaxWhisperForConditionalGeneration,
739
+ processor: WhisperProcessor,
740
+ output_dir: str,
741
+ cur_step: int,
742
+ total_train_steps: int,
743
+ use_scan: bool = True,
744
+ checkpoint_prefix: str = "checkpoint",
745
+ ) -> None:
746
+ # always disable scan in the params / model so that we can load from PyTorch directly - this is a no-op if we're not using scan for training
747
+ student_state_params = unreplicate(student_state.params)
748
+ student_state_params = student_model.convert_scan_to_unroll(student_state_params)
749
+ student_params = jax.device_get(student_state_params)
750
+ student_model.disable_scan()
751
+
752
+ if cur_step != total_train_steps:
753
+ output_dir = os.path.join(output_dir, f"{checkpoint_prefix}-{cur_step}")
754
+ os.makedirs(output_dir, exist_ok=True)
755
+
756
+ student_model.save_pretrained(output_dir, params=student_params)
757
+ processor.save_pretrained(output_dir)
758
+
759
+ # re-enable scan only if required for training
760
+ if use_scan:
761
+ student_model.enable_scan()
762
+
763
+
764
+ def write_train_metric(summary_writer, train_metrics, train_time, step, logging_steps):
765
+ summary_writer.scalar("train/time", train_time, step)
766
+
767
+ train_metrics = get_metrics(train_metrics)
768
+ for key, vals in train_metrics.items():
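+ # np.arange(0, step, logging_steps) gives the candidate logging steps; the slice keeps the most recent len(vals) of them so each value is written at its own step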
769
+ steps_arr = np.arange(0, step, logging_steps)[-len(vals) :]
770
+ tag = f"train/{key}"
771
+ for i, val in enumerate(vals):
772
+ summary_writer.scalar(tag, val, steps_arr[i])
773
+
774
+
775
+ def write_eval_metric(summary_writer, eval_metrics, step, prefix="eval"):
776
+ for metric_name, value in eval_metrics.items():
777
+ summary_writer.scalar(f"{prefix}/{metric_name}", value, step)
778
+
779
+
780
+ def write_wandb_metric(wandb_logger, metrics, train_time, step, epoch, prefix="train"):
781
+ log_metrics = {}
782
+ for k, v in metrics.items():
783
+ log_metrics[f"{prefix}/{k}"] = v
784
+ log_metrics[f"{prefix}/time"] = train_time
785
+ log_metrics[f"{prefix}/epoch"] = epoch
786
+ wandb_logger.log(log_metrics, step)
787
+
788
+
789
+ def write_wandb_pred(
790
+ wandb_logger, pred_str, label_str, norm_pred_str, norm_label_str, cur_step, prefix="eval", num_lines=200000
791
+ ):
792
+ # pretty name for current step: step 50000 -> step 50k
793
+ cur_step_pretty = f"{int(cur_step // 1000)}k" if cur_step > 1000 else cur_step
794
+ # convert str data to a wandb compatible format
795
+ str_data = [[label_str[i], pred_str[i], norm_label_str[i], norm_pred_str[i]] for i in range(len(pred_str))]
796
+ # log as a table with the appropriate headers
797
+ wandb_logger.log(
798
+ {
799
+ f"predictions/{prefix.replace('/', '-')}-step-{cur_step_pretty}": wandb_logger.Table(
800
+ columns=["Target", "Pred", "Norm Target", "Norm Pred"], data=str_data[:num_lines]
801
+ )
802
+ },
803
+ cur_step,
804
+ )
805
+ # log incorrect normalised predictions
806
+ str_data = np.asarray(str_data)
807
+ str_data_incorrect = str_data[str_data[:, -2] != str_data[:, -1]]
808
+ # log as a table with the appropriate headers
809
+ wandb_logger.log(
810
+ {
811
+ f"incorrect_predictions/{prefix.replace('/', '-')}-step-{cur_step_pretty}": wandb_logger.Table(
812
+ columns=["Target", "Pred", "Norm Target", "Norm Pred"], data=str_data_incorrect[:num_lines]
813
+ )
814
+ },
815
+ cur_step,
816
+ )
817
+
818
+
819
+ def create_learning_rate_fn(
820
+ num_train_steps: int, lr_scheduler_type: str, num_warmup_steps: int, learning_rate: float
821
+ ) -> Callable[[int], jnp.array]:
822
+ """Returns a linear warmup, linear_decay learning rate function."""
823
+ lr_scheduler_types = ("linear", "constant_with_warmup")
824
+
825
+ if lr_scheduler_type not in lr_scheduler_types:
826
+ raise ValueError(
827
+ f"lr_scheduler_type of type {lr_scheduler_type} not supported, choose from {lr_scheduler_types}."
828
+ )
829
+
830
+ warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
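+ # for "constant_with_warmup" the second schedule is flat (init_value == end_value), so only the warmup ramp varies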
831
+ decay_fn = optax.linear_schedule(
832
+ init_value=learning_rate,
833
+ end_value=0 if lr_scheduler_type == "linear" else learning_rate,
834
+ transition_steps=num_train_steps - num_warmup_steps,
835
+ )
836
+ schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
837
+ return schedule_fn
838
+
839
+
840
+ def convert_dataset_str_to_list(
841
+ dataset_names,
842
+ dataset_config_names,
843
+ splits=None,
844
+ text_column_names=None,
845
+ dataset_samples=None,
846
+ default_split="train",
847
+ ):
848
+ if isinstance(dataset_names, str):
849
+ dataset_names = dataset_names.split("+")
850
+
851
+ # we assume that all the datasets we're using derive from the distil-whisper org on the Hub - prepend the org name if necessary
852
+ for i in range(len(dataset_names)):
853
+ ds_name = dataset_names[i]
854
+ dataset_names[i] = f"distil-whisper/{ds_name}" if "/" not in ds_name else ds_name
855
+
856
+ dataset_config_names = dataset_config_names.split("+")
857
+ splits = splits.split("+") if splits is not None else None
858
+ text_column_names = text_column_names.split("+") if text_column_names is not None else None
859
+ dataset_samples = dataset_samples.split("+") if dataset_samples is not None else None
860
+
861
+ # basic checks to ensure we've got the right number of datasets/configs/splits/columns/probs
862
+ if len(dataset_names) != len(dataset_config_names):
863
+ raise ValueError(
864
+ f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
865
+ f" {len(dataset_config_names)} configs."
866
+ )
867
+
868
+ if splits is not None and len(splits) != len(dataset_names):
869
+ raise ValueError(
870
+ f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
871
+ )
872
+
873
+ if text_column_names is not None and len(text_column_names) != len(dataset_names):
874
+ raise ValueError(
875
+ f"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and"
876
+ f" {len(text_column_names)} text column names."
877
+ )
878
+
879
+ if dataset_samples is not None:
880
+ if len(dataset_samples) != len(dataset_names):
881
+ raise ValueError(
882
+ f"Ensure one sample is passed for each dataset, got {len(dataset_names)} datasets and "
883
+ f"{len(dataset_samples)} samples."
884
+ )
885
+ dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
886
+ else:
887
+ dataset_samples = [None] * len(dataset_names)
888
+
889
+ text_column_names = (
890
+ text_column_names if text_column_names is not None else ["text" for _ in range(len(dataset_names))]
891
+ )
892
+ splits = splits if splits is not None else [default_split for _ in range(len(dataset_names))]
893
+
894
+ dataset_names_dict = []
895
+ for i, ds_name in enumerate(dataset_names):
896
+ dataset_names_dict.append(
897
+ {
898
+ "name": ds_name,
899
+ "config": dataset_config_names[i],
900
+ "split": splits[i],
901
+ "text_column_name": text_column_names[i],
902
+ "samples": dataset_samples[i],
903
+ }
904
+ )
905
+ return dataset_names_dict
906
+
907
+
908
+ def load_multiple_datasets(
909
+ dataset_names: Union[List, str],
910
+ dataset_config_names: Union[List, str],
911
+ splits: Optional[Union[List, str]] = None,
912
+ text_column_names: Optional[List] = None,
913
+ sampling_rate: Optional[int] = 16000,
914
+ stopping_strategy: Optional[str] = "first_exhausted",
915
+ dataset_samples: Optional[Union[List, np.array]] = None,
916
+ streaming: bool = True,
917
+ seed: int = None,
918
+ **kwargs,
919
+ ) -> IterableDataset:
920
+ dataset_names_dict = convert_dataset_str_to_list(
921
+ dataset_names, dataset_config_names, splits, text_column_names, dataset_samples
922
+ )
923
+
924
+ if dataset_samples is not None:
925
+ dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
926
+ probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
927
+ else:
928
+ probabilities = None
929
+
930
+ if len(dataset_names_dict) == 1:
931
+ dataset_dict = dataset_names_dict[0]
932
+ # we have a single dataset so just return it as is
933
+ return load_dataset(
934
+ dataset_dict["name"],
935
+ dataset_dict["config"],
936
+ split=dataset_dict["split"],
937
+ streaming=streaming,
938
+ **kwargs,
939
+ )
940
+
941
+ all_datasets = []
942
+ # iterate over the datasets we want to interleave
943
+ for dataset_dict in tqdm(dataset_names_dict, desc="Combining datasets..."):
944
+ dataset = load_dataset(
945
+ dataset_dict["name"],
946
+ dataset_dict["config"],
947
+ split=dataset_dict["split"],
948
+ streaming=streaming,
949
+ **kwargs,
950
+ )
951
+ # resample to specified sampling rate
952
+ dataset = dataset.cast_column("audio", datasets.features.Audio(sampling_rate))
953
+ dataset = dataset.remove_columns(
954
+ set(dataset.features.keys()) - {"audio", dataset_dict["text_column_name"], "whisper_transcript"}
955
+ )
956
+ all_datasets.append(dataset)
957
+
958
+ if streaming:
959
+ interleaved_dataset = interleave_datasets(
960
+ all_datasets,
961
+ stopping_strategy=stopping_strategy,
962
+ probabilities=probabilities,
963
+ seed=seed,
964
+ )
965
+ else:
966
+ interleaved_dataset = concatenate_datasets(all_datasets)
967
+
968
+ return interleaved_dataset
969
+
970
+
971
+ def get_layers_to_supervise(student_layers: int, teacher_layers: int) -> dict:
972
+ """Helper function to map the student layer i to the teacher layer j whose output we'd like them to emulate. Used
973
+ for MSE loss terms in distillation (hidden-states and activations). Student layers are paired with teacher layers
974
+ in equal increments, e.g. for a 12-layer model distilled to a 3-layer model, student layer 0 emulates teacher layer
975
+ 3 (such that it behaves like the first 4 teacher layers), student layer 1 emulates teacher layer 7, and student layer
976
+ 2 emulates teacher layer 11. This mapping is summarised by the dictionary: {0: 3, 1: 7, 2: 11}, which is precisely
977
+ the output of this function for the arguments (student_layers=3, teacher_layers=12)."""
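+ # e.g. student_layers=3, teacher_layers=12: np.linspace(3, 11, 3) -> [3, 7, 11]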
978
+ layer_intervals = np.linspace(teacher_layers // student_layers - 1, teacher_layers - 1, student_layers, dtype=int)
979
+ layer_intervals[-1] = teacher_layers - 1
980
+ layer_map = {}
981
+
982
+ for student_layer, teacher_layer in enumerate(layer_intervals):
983
+ layer_map[student_layer] = teacher_layer
984
+
985
+ return layer_map
986
+
987
+
988
+ class FlaxWhisperFeatureExtractor(WhisperFeatureExtractor):
989
+ def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
990
+ """
991
+ Compute the log-mel spectrogram of the provided audio using torch filters. Using the torch implementation
992
+ computes stft filter banks approx 5x faster than its numpy counterpart, which is the native implementation
993
+ in transformers, and matches to within 1e-5 abs tolerance.
994
+ """
995
+ waveform = torch.from_numpy(waveform).type(torch.float32)
996
+
997
+ window = torch.hann_window(self.n_fft)
998
+ stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
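+ # drop the final STFT frame and square the magnitudes to get the power spectrogram, mirroring the reference OpenAI Whisper implementation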
999
+ magnitudes = stft[..., :-1].abs() ** 2
1000
+
1001
+ mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
1002
+ mel_spec = mel_filters.T @ magnitudes
1003
+
1004
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
1005
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
1006
+ log_spec = (log_spec + 4.0) / 4.0
1007
+ return log_spec.numpy()
1008
+
1009
+
1010
+ def main():
1011
+ # 1. Parse input arguments
1012
+ # See all possible arguments in src/transformers/training_args.py
1013
+ # or by passing the --help flag to this script.
1014
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
1015
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, FlaxSeq2SeqTrainingArguments))
1016
+
1017
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
1018
+ # If we pass only one argument to the script and it's the path to a json file,
1019
+ # let's parse it to get our arguments.
1020
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
1021
+ else:
1022
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
1023
+
1024
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
1025
+ # information sent is the one passed as arguments along with your JAX/Flax versions.
1026
+ send_example_telemetry("run_flax_speech_recognition_seq2seq", model_args, data_args, framework="flax")
1027
+
1028
+ # 2. Define remote logging - do this early so that we get the full traceback on our remote logs
1029
+ # Enable tensorboard only on the master node
1030
+ has_tensorboard = is_tensorboard_available()
1031
+ if has_tensorboard:
1032
+ if jax.process_index() == 0:
1033
+ try:
1034
+ from flax.metrics.tensorboard import SummaryWriter
1035
+
1036
+ summary_writer = SummaryWriter(log_dir=os.path.join(Path(training_args.output_dir), "runs"))
1037
+ except ImportError as ie:
1038
+ has_tensorboard = False
1039
+ logger.warning(
1040
+ "Unable to display metrics through TensorBoard because some package" f" are not installed: {ie}"
1041
+ )
1042
+ else:
1043
+ logger.warning(
1044
+ "Unable to display metrics through TensorBoard because the package is not"
1045
+ " installed: Please run `pip install tensorboard` to enable."
1046
+ )
1047
+
1048
+ # Enable wandb only on the master node
1049
+ has_wandb = is_wandb_available()
1050
+ if has_wandb:
1051
+ import wandb as wandb_logger
1052
+
1053
+ # Set up wandb run
1054
+ if jax.process_index() == 0:
1055
+ wandb_logger.init(
1056
+ project=data_args.wandb_project,
1057
+ name=data_args.wandb_name,
1058
+ job_type=data_args.wandb_job_type,
1059
+ dir=data_args.wandb_dir,
1060
+ save_code=data_args.save_code_to_wandb,
1061
+ )
1062
+ else:
1063
+ logger.warning("Wandb logging requires wandb to be installed. Run `pip install wandb` to enable.")
1064
+
1065
+ # 3. Setup local logging
1066
+ # Make one log on every process with the configuration for debugging.
1067
+ logging.basicConfig(
1068
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
1069
+ datefmt="%m/%d/%Y %H:%M:%S",
1070
+ handlers=[logging.StreamHandler(sys.stdout)],
1071
+ )
1072
+ # Set the verbosity to info of the Transformers logger.
1073
+ # We only want one process per machine to log things on the screen.
1074
+ logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
1075
+ if jax.process_index() == 0:
1076
+ datasets.utils.logging.set_verbosity_warning()
1077
+ transformers.utils.logging.set_verbosity_info()
1078
+ else:
1079
+ datasets.utils.logging.set_verbosity_error()
1080
+ transformers.utils.logging.set_verbosity_error()
1081
+
1082
+ logger.info("Training/evaluation parameters %s", training_args)
1083
+
1084
+ # Check the output dir is valid
1085
+ if (
1086
+ os.path.exists(training_args.output_dir)
1087
+ and os.listdir(training_args.output_dir)
1088
+ and training_args.do_train
1089
+ and not training_args.overwrite_output_dir
1090
+ ):
1091
+ raise ValueError(
1092
+ f"Output directory ({training_args.output_dir}) already exists and is not"
1093
+ " empty. Use `--overwrite_output_dir` to overcome."
1094
+ )
1095
+
1096
+ # 4. Handle the repository creation
1097
+ if training_args.push_to_hub:
1098
+ if training_args.hub_model_id is None:
1099
+ repo_name = get_full_repo_name(
1100
+ Path(training_args.output_dir).absolute().name,
1101
+ token=training_args.hub_token,
1102
+ )
1103
+ else:
1104
+ repo_name = training_args.hub_model_id
1105
+ create_repo(repo_name, exist_ok=True, token=training_args.hub_token)
1106
+ repo = Repository(
1107
+ training_args.output_dir,
1108
+ clone_from=repo_name,
1109
+ token=training_args.hub_token,
1110
+ )
1111
+
1112
+ if training_args.compilation_cache:
1113
+ cc.initialize_cache(os.path.join(model_args.cache_dir, "jax_cache"))
1114
+
1115
+ # 5. Load dataset
1116
+ raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1117
+
1118
+ # set seed for determinism
1119
+ set_seed(training_args.seed)
1120
+
1121
+ if training_args.do_train:
1122
+ raw_datasets["train"] = load_multiple_datasets(
1123
+ data_args.train_dataset_name,
1124
+ data_args.train_dataset_config_name,
1125
+ splits=data_args.train_split_name,
1126
+ streaming=data_args.streaming,
1127
+ dataset_samples=data_args.train_dataset_samples,
1128
+ seed=training_args.seed,
1129
+ cache_dir=data_args.dataset_cache_dir,
1130
+ token=True if model_args.use_auth_token else None,
1131
+ )
1132
+
1133
+ if training_args.do_eval:
1134
+ dataset_names_dict = convert_dataset_str_to_list(
1135
+ data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
1136
+ (
1137
+ data_args.eval_dataset_config_name
1138
+ if data_args.eval_dataset_config_name
1139
+ else data_args.train_dataset_config_name
1140
+ ),
1141
+ splits=data_args.eval_split_name,
1142
+ text_column_names=data_args.eval_text_column_name,
1143
+ )
1144
+ all_eval_splits = []
1145
+ if len(dataset_names_dict) == 1:
1146
+ # load a single eval set
1147
+ dataset_dict = dataset_names_dict[0]
1148
+ all_eval_splits.append("eval")
1149
+ raw_datasets["eval"] = load_dataset(
1150
+ dataset_dict["name"],
1151
+ dataset_dict["config"],
1152
+ split=dataset_dict["split"],
1153
+ cache_dir=data_args.dataset_cache_dir,
1154
+ token=True if model_args.use_auth_token else None,
1155
+ streaming=data_args.streaming,
1156
+ )
1157
+ else:
1158
+ # load multiple eval sets
1159
+ for dataset_dict in dataset_names_dict:
1160
+ if dataset_dict["name"] == "esb/diagnostic-dataset":
1161
+ # for the ESB diagnostic dataset, the dataset name is effectively the config
1162
+ pretty_name = f"{dataset_dict['config']}-diagnostic/{dataset_dict['split']}"
1163
+ else:
1164
+ pretty_name = f"{dataset_dict['name'].split('/')[-1]}/{dataset_dict['split'].replace('.', '-')}"
1165
+ all_eval_splits.append(pretty_name)
1166
+ raw_datasets[pretty_name] = load_dataset(
1167
+ dataset_dict["name"],
1168
+ dataset_dict["config"],
1169
+ split=dataset_dict["split"],
1170
+ cache_dir=data_args.dataset_cache_dir,
1171
+ token=True if model_args.use_auth_token else None,
1172
+ streaming=data_args.streaming,
1173
+ )
1174
+ features = raw_datasets[pretty_name].features.keys()
1175
+ if "text" not in features:
1176
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].rename_column(
1177
+ dataset_dict["text_column_name"], "text"
1178
+ )
1179
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].remove_columns(
1180
+ set(raw_datasets[pretty_name].features.keys()) - {"audio", "text"}
1181
+ )
1182
+
1183
+ if not training_args.do_train and not training_args.do_eval:
1184
+ raise ValueError(
1185
+ "Cannot not train and not do evaluation. At least one of training or evaluation has to be performed."
1186
+ )
1187
+
1188
+ raw_datasets_train_features = list(raw_datasets["train"].features.keys())
1189
+
1190
+ if data_args.audio_column_name not in raw_datasets_train_features:
1191
+ raise ValueError(
1192
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset"
1193
+ f" '{data_args.dataset_name}'. Make sure to set `--audio_column_name` to"
1194
+ " the correct audio column - one of"
1195
+ f" {', '.join(raw_datasets_train_features)}."
1196
+ )
1197
+
1198
+ if data_args.train_text_column_name not in raw_datasets_train_features:
1199
+ raise ValueError(
1200
+ f"--train_text_column_name {data_args.train_text_column_name} not found in dataset"
1201
+ f" '{data_args.dataset_name}'. Make sure to set `--train_text_column_name` to the"
1202
+ " correct text column - one of"
1203
+ f" {', '.join(raw_datasets_train_features)}."
1204
+ )
1205
+
1206
+ # 6. Load pretrained model, tokenizer, and feature extractor
1207
+ config = WhisperConfig.from_pretrained(
1208
+ (model_args.config_name if model_args.config_name else model_args.model_name_or_path),
1209
+ cache_dir=model_args.cache_dir,
1210
+ revision=model_args.model_revision,
1211
+ token=True if model_args.use_auth_token else None,
1212
+ )
1213
+ feature_extractor = FlaxWhisperFeatureExtractor.from_pretrained(
1214
+ (model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path),
1215
+ cache_dir=model_args.cache_dir,
1216
+ revision=model_args.model_revision,
1217
+ token=True if model_args.use_auth_token else None,
1218
+ )
1219
+ tokenizer = WhisperTokenizerFast.from_pretrained(
1220
+ (model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path),
1221
+ cache_dir=model_args.cache_dir,
1222
+ use_fast=model_args.use_fast_tokenizer,
1223
+ revision=model_args.model_revision,
1224
+ token=True if model_args.use_auth_token else None,
1225
+ )
1226
+
1227
+ # override timestamp tokens until tokenizer issues are fixed in transformers
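+ # this registers <|0.00|> up to <|30.00|> in 0.02 s increments (1501 timestamp tokens)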
1228
+ timestamps = [AddedToken("<|%.2f|>" % (i * 0.02), lstrip=False, rstrip=False) for i in range(1500 + 1)]
1229
+ tokenizer.add_tokens(timestamps)
1230
+
1231
+ config.update(
1232
+ {
1233
+ "activation_dropout": model_args.activation_dropout,
1234
+ "attention_dropout": model_args.attention_dropout,
1235
+ "dropout": model_args.dropout,
1236
+ }
1237
+ )
1238
+
1239
+ if training_args.precision == "full_mixed":
1240
+ # forward pass, backward pass and optimiser states in bf16
1241
+ dtype = jnp.bfloat16
1242
+ to_dtype = to_bf16
1243
+ elif training_args.precision == "half_mixed" or model_args.dtype == "bfloat16":
1244
+ # forward pass in bf16, backward pass and optimiser states in fp32
1245
+ dtype = jnp.bfloat16
1246
+ to_dtype = to_fp32
1247
+ else:
1248
+ if training_args.precision != "full":
1249
+ raise ValueError(
1250
+ f"`precision` should be one of: `full`, `half_mixed` or `full_mixed`, got {training_args.precision}"
1251
+ )
1252
+ # forward pass, backward pass and optimiser states in fp32
1253
+ dtype = jnp.float32
1254
+ to_dtype = to_fp32
1255
+
1256
+ student_model, student_params = FlaxWhisperForConditionalGeneration.from_pretrained(
1257
+ model_args.model_name_or_path,
1258
+ config=config,
1259
+ dtype=dtype,
1260
+ cache_dir=model_args.cache_dir,
1261
+ revision=model_args.model_revision,
1262
+ subfolder=model_args.subfolder,
1263
+ token=True if model_args.use_auth_token else None,
1264
+ _do_init=False,
1265
+ use_scan=model_args.load_with_scan_weights,
1266
+ )
1267
+
1268
+ teacher_model, teacher_params = FlaxWhisperForConditionalGeneration.from_pretrained(
1269
+ model_args.teacher_model_name_or_path,
1270
+ # config=config,
1271
+ dtype=dtype,
1272
+ cache_dir=model_args.cache_dir,
1273
+ # revision=model_args.model_revision,
1274
+ token=True if model_args.use_auth_token else None,
1275
+ _do_init=False,
1276
+ )
1277
+
1278
+ if student_model.config.decoder_start_token_id is None or teacher_model.config.decoder_start_token_id is None:
1279
+ raise ValueError(
1280
+ f"Make sure that `config.decoder_start_token_id` is correctly defined for both the "
1281
+ f"student and teacher model. Got {student_model.config.decoder_start_token_id} for the "
1282
+ f"student and {teacher_model.config.decoder_start_token_id} for the teacher."
1283
+ )
1284
+
1285
+ # enable scan / gradient checkpointing if necessary
1286
+ if training_args.use_scan:
1287
+ student_model.enable_scan() # to enable scan in the nn.Module
1288
+ student_params = student_model.convert_unroll_to_scan(student_params) # to convert the unrolled params to scan
1289
+
1290
+ teacher_model.enable_scan() # faster compile time (even though we don't train the teacher)
1291
+ teacher_params = teacher_model.convert_unroll_to_scan(teacher_params)
1292
+
1293
+ if training_args.gradient_checkpointing:
1294
+ student_model.enable_gradient_checkpointing() # to enable checkpointing in the nn.Module, there is no change to the params structure
1295
+ teacher_model.enable_gradient_checkpointing()
1296
+
1297
+ if hasattr(teacher_model.generation_config, "is_multilingual") and teacher_model.generation_config.is_multilingual:
1298
+ # We need to set the language and task ids for previously multilingual checkpoints - for now we hardcode this to Norwegian
1299
+ tokenizer.set_prefix_tokens(language="Norwegian", task="transcribe", predict_timestamps=False)
1300
+ student_model.generation_config.update(
1301
+ **{
1302
+ "language": "<|no|>",
1303
+ "task": "transcribe",
1304
+ }
1305
+ )
1306
+
1307
+ # 7. Resample speech dataset: `datasets` takes care of automatically loading and resampling the audio,
1308
+ # so we just need to set the correct target sampling rate.
1309
+ raw_datasets = raw_datasets.cast_column(
1310
+ data_args.audio_column_name,
1311
+ datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
1312
+ )
1313
+
1314
+ # 8. Preprocessing the datasets.
1315
+ # We need to read the audio files as arrays and tokenize the targets.
1316
+ max_input_length = int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate)
1317
+ min_input_length = int(data_args.min_duration_in_seconds * feature_extractor.sampling_rate)
1318
+ max_label_length = (
1319
+ data_args.max_label_length if data_args.max_label_length is not None else student_model.config.max_length
1320
+ )
1321
+ audio_column_name = data_args.audio_column_name
1322
+ num_workers = data_args.preprocessing_num_workers
1323
+ dataloader_num_workers = training_args.dataloader_num_workers
1324
+ dataloader_prefetch_size = data_args.prefetch_size
1325
+ train_text_column_name = data_args.train_text_column_name
1326
+ eval_text_column_name = "text"
1327
+ model_input_name = feature_extractor.model_input_names[0]
1328
+ normalizer = BasicTextNormalizer(tokenizer.english_spelling_normalizer)
1329
+ wer_threshold = data_args.wer_threshold
1330
+ round_timestamps = data_args.round_timestamps
1331
+
1332
+ if training_args.do_train and data_args.max_train_samples is not None:
1333
+ raw_datasets["train"] = (
1334
+ raw_datasets["train"].take(data_args.max_train_samples)
1335
+ if data_args.streaming
1336
+ else raw_datasets["train"].select(range(data_args.max_train_samples))
1337
+ )
1338
+
1339
+ if training_args.do_eval and data_args.max_eval_samples is not None:
1340
+ for eval_split in all_eval_splits:
1341
+ raw_datasets[eval_split] = (
1342
+ raw_datasets[eval_split].take(data_args.max_eval_samples)
1343
+ if data_args.streaming
1344
+ else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
1345
+ )
1346
+
1347
+ # 10.3: filter training data based on WER threshold -> this is KEY to good distillation performance
1348
+ def is_wer_in_range(ground_truth, whisper_transcript):
1349
+ norm_ground_truth = normalizer(ground_truth)
1350
+ if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
1351
+ # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
1352
+ return False
1353
+ elif len(norm_ground_truth) == 0 and len(normalizer(whisper_transcript)) == 0:
1354
+ return True
1355
+ elif len(norm_ground_truth.strip()) > 0 and whisper_transcript is not None and len(normalizer(whisper_transcript).strip()) > 0:
1356
+ norm_whisper_transcript = normalizer(whisper_transcript)
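+ # evaluate's "wer" metric returns a fraction, so multiply by 100 to compare against the percentage threshold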
1357
+ wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
1358
+ return wer < wer_threshold
1359
+ else:
1360
+ # filter automatically since we can't know the WER
1361
+ return False
1362
+
1363
+
1364
+ filter_by_wer_threshold = partial(
1365
+ raw_datasets["train"].filter,
1366
+ function=is_wer_in_range,
1367
+ input_columns=[eval_text_column_name, train_text_column_name],
1368
+ )
1369
+
1370
+ if wer_threshold is not None:
1371
+ raw_datasets["train"] = (
1372
+ filter_by_wer_threshold(num_proc=num_workers, desc="filtering train dataset by wer")
1373
+ if not data_args.streaming
1374
+ else filter_by_wer_threshold()
1375
+ )
1376
+
1377
+ def has_timestamp_tokens(input_str):
1378
+ """
1379
+ Identify whether the input string contains timestamp tokens, of the form <|0.00|>, by searching for
1380
+ pairs of left and right-angle brackets.
1381
+ """
1382
+ return bool(re.search(r"<[^>]*>", input_str))
1383
+
1384
+ def round_timestamp_tokens(input_str: str, ndigits: int = 1):
1385
+ timestamps = re.findall(r"<[^>]*>", input_str, re.DOTALL)
1386
+ for token in timestamps:
1387
+ # extract time digits from timestamp token, e.g. <|6.24|> to 6.24
1388
+ time_digit = token[2:-2]
1389
+ # round to specified number of digits, e.g. 6.24 to 6.2
1390
+ time_digit = round(float(time_digit), ndigits=ndigits)
1391
+ # replace in original string with the same precision, e.g. <|6.24|> to <|6.20|>
1392
+ input_str = input_str.replace(token, "<|{:.2f}|>".format(time_digit))
1393
+ return input_str
1394
+
1395
+ def prepare_train_dataset(batch):
1396
+ # process audio input
1397
+ sample = batch[audio_column_name]
1398
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1399
+ batch[model_input_name] = inputs.get(model_input_name)[0]
1400
+ batch["input_length"] = len(sample["array"])
1401
+
1402
+ # process text targets
1403
+ input_str = batch[train_text_column_name]
1404
+
1405
+ # prompt & timestamp processing: for now, we only do one or the other
1406
+ if input_str.startswith("<|startoftranscript|>") or input_str.startswith("<|startofprev|>"):
1407
+ # prompted target text already has special ids added, so don't add them here
1408
+ batch["labels"] = tokenizer(input_str, add_special_tokens=False).input_ids
1409
+ return batch
1410
+
1411
+ has_timestamps = has_timestamp_tokens(input_str)
1412
+
1413
+ if has_timestamps:
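+ # keep the timestamp tokens for this example with probability `timestamp_probability`, otherwise strip them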
1414
+ predict_timestamps = bool(np.random.binomial(1, data_args.timestamp_probability))
1415
+ if not predict_timestamps:
1416
+ # filter timestamp token ids if not part of the prediction task
1417
+ input_str = tokenizer._filter_timestamp_ids(input_str)
1418
+ elif round_timestamps:
1419
+ input_str = round_timestamp_tokens(input_str)
1420
+ else:
1421
+ predict_timestamps = False
1422
+
1423
+ tokenizer.set_prefix_tokens(language="Norwegian", task="transcribe", predict_timestamps=predict_timestamps)
1424
+ input_ids = tokenizer(input_str).input_ids
1425
+ batch["labels"] = input_ids
1426
+ return batch
1427
+
1428
+ def prepare_eval_dataset(batch):
1429
+ # process audio
1430
+ sample = batch[audio_column_name]
1431
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1432
+ # process audio length
1433
+ batch[model_input_name] = inputs.get(model_input_name)[0]
1434
+ batch["input_length"] = len(sample["array"])
1435
+
1436
+ # process targets
1437
+ input_str = batch[eval_text_column_name]
1438
+ batch["labels"] = tokenizer(input_str).input_ids
1439
+ return batch
1440
+
1441
+ vectorized_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1442
+ if training_args.do_train:
1443
+ map_fn_train = partial(
1444
+ raw_datasets["train"].map, function=prepare_train_dataset, remove_columns=raw_datasets_train_features
1445
+ )
1446
+ vectorized_datasets["train"] = (
1447
+ map_fn_train(num_proc=num_workers, desc="preprocess train dataset")
1448
+ if not data_args.streaming
1449
+ else map_fn_train()
1450
+ )
1451
+ if training_args.do_eval:
1452
+ for eval_split in all_eval_splits:
1453
+ raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
1454
+ map_fn_eval = partial(
1455
+ raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
1456
+ )
1457
+ vectorized_datasets[eval_split] = (
1458
+ map_fn_eval(num_proc=num_workers, desc="preprocess eval dataset")
1459
+ if not data_args.streaming
1460
+ else map_fn_eval()
1461
+ )
1462
+
1463
+ # filter training data with inputs longer than max_input_length
1464
+ def is_audio_in_length_range(length):
1465
+ return min_input_length < length < max_input_length
1466
+
1467
+ filter_by_audio_fn = partial(
1468
+ vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1469
+ )
1470
+ vectorized_datasets = (
1471
+ filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1472
+ if not data_args.streaming
1473
+ else filter_by_audio_fn()
1474
+ )
1475
+
1476
+ # filter training data with labels longer than max_label_length
1477
+ def is_labels_in_length_range(labels):
1478
+ return 0 < len(labels) < max_label_length
1479
+
1480
+ filter_by_labels_fn = partial(
1481
+ vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1482
+ )
1483
+ vectorized_datasets = (
1484
+ filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1485
+ if not data_args.streaming
1486
+ else filter_by_labels_fn()
1487
+ )
1488
+
1489
+ # for large datasets it is advised to run the preprocessing on a
1490
+ # single machine first with `args.preprocessing_only` since there will most likely
1491
+ # be a timeout when running the script in distributed mode.
1492
+ # In a second step `args.preprocessing_only` can then be set to `False` to load the
1493
+ # cached dataset
1494
+ if data_args.preprocessing_only:
1495
+ cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1496
+ logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1497
+ return
1498
+
1499
+ # 8. Load Metric
1500
+ metric = evaluate.load("wer")
1501
+ # convention is that we space all punctuation *except* apostrophes
1502
+ all_punctuation = list(string.punctuation.replace("'", ""))
1503
+ return_timestamps = data_args.return_timestamps if data_args.timestamp_probability > 0 else False
1504
+
1505
+ def compute_metrics(preds, labels):
1506
+ # replace padded labels by the padding token
1507
+ for idx in range(len(labels)):
1508
+ labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
1509
+
1510
+ pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
1511
+ # we do not want to group tokens when computing the metrics
1512
+ label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
1513
+
1514
+ # space punctuation for orthographic WER (c.f. ESB paper https://arxiv.org/abs/2210.13352)
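+ # the orthographic WER below is computed on the cased, punctuated text (with punctuation split
+ # out as separate words), whereas the headline WER further down uses the normalised text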
1515
+ spaced_pred_str = [
1516
+ pred_str[i].replace(punctuation, f" {punctuation} ")
1517
+ for punctuation in all_punctuation
1518
+ for i in range(len(pred_str))
1519
+ ]
1520
+ spaced_label_str = [
1521
+ label_str[i].replace(punctuation, f" {punctuation} ")
1522
+ for punctuation in all_punctuation
1523
+ for i in range(len(label_str))
1524
+ ]
1525
+ wer_ortho = 100 * metric.compute(predictions=spaced_pred_str, references=spaced_label_str)
1526
+
1527
+ norm_pred_str, norm_label_str = [], []
1528
+
1529
+ # Iterate through all predictions and labels
1530
+ for pred, label in zip(pred_str, label_str):
1531
+ # Normalize the prediction and label
1532
+ normalized_pred = normalizer(pred)
1533
+ normalized_label = normalizer(label)
1534
+
1535
+ # If either normalized string is empty after normalization, replace with "<|nospeech|>"
1536
+ if not normalized_pred.strip():
1537
+ normalized_pred = "<|nospeech|>"
1538
+ if not normalized_label.strip():
1539
+ normalized_label = "<|nospeech|>"
1540
+
1541
+ norm_pred_str.append(normalized_pred)
1542
+ norm_label_str.append(normalized_label)
1543
+
1544
+ # Replace empty original strings with "<|nospeech|>" for consistency
1545
+ pred_str = [pred if len(pred.strip()) > 0 else "<|nospeech|>" for pred in pred_str]
1546
+ label_str = [label if len(label.strip()) > 0 else "<|nospeech|>" for label in label_str]
1547
+
1548
+ # Compute WER on the normalised entries, including those replaced with "<|nospeech|>"
1549
+ wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
1550
+ return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1551
+
1552
+
1553
+ # 9. Save feature extractor, tokenizer, config and generation config
1554
+ feature_extractor.save_pretrained(training_args.output_dir)
1555
+ tokenizer.save_pretrained(training_args.output_dir)
1556
+ config.save_pretrained(training_args.output_dir)
1557
+ student_model.generation_config.save_pretrained(
1558
+ training_args.output_dir
1559
+ ) # generation config stays bound to model to make it easy to jit
1560
+
1561
+ processor = WhisperProcessor.from_pretrained(training_args.output_dir)
1562
+
1563
+ data_collator = FlaxDataCollatorSpeechSeq2SeqWithPadding(
1564
+ processor=processor,
1565
+ decoder_start_token_id=student_model.config.decoder_start_token_id, # <|startoftranscript|>
1566
+ decoder_prev_token_id=tokenizer.all_special_ids[-3], # <|startofprev|>
1567
+ input_padding="longest",
1568
+ target_padding="max_length",
1569
+ max_target_length=max_label_length,
1570
+ )
1571
+
1572
+ # Initialize our training
1573
+ rng = jax.random.PRNGKey(training_args.seed)
1574
+ rng, dropout_rng = jax.random.split(rng)
1575
+
1576
+ # Store some constants
1577
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
1578
+ gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1579
+ per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1580
+ eval_batch_size = per_device_eval_batch_size * jax.device_count()
1581
+
1582
+ if not data_args.streaming and training_args.max_steps < 0:
1583
+ num_epochs = int(training_args.num_train_epochs)
1584
+ steps_per_epoch = len(vectorized_datasets["train"]) // train_batch_size
1585
+ total_train_steps = steps_per_epoch * num_epochs
1586
+ elif training_args.max_steps > 0:
1587
+ logger.info("max_steps is given, it will override any value given in num_train_epochs")
1588
+ total_train_steps = int(training_args.max_steps)
1589
+ # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1590
+ num_epochs = sys.maxsize
1591
+ steps_per_epoch = total_train_steps
1592
+ else:
1593
+ raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1594
+
1595
+ if training_args.eval_steps is None:
1596
+ logger.info(
1597
+ f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1598
+ )
1599
+ eval_steps = steps_per_epoch
1600
+ else:
1601
+ eval_steps = training_args.eval_steps
1602
+
1603
+ # Create learning rate schedule
1604
+ linear_decay_lr_schedule_fn = create_learning_rate_fn(
1605
+ total_train_steps * gradient_accumulation_steps,
1606
+ training_args.lr_scheduler_type,
1607
+ training_args.warmup_steps * gradient_accumulation_steps,
1608
+ training_args.learning_rate,
1609
+ )
1610
+
1611
+ # We use Optax's "masking" functionality to not apply weight decay
1612
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
1613
+ # mask boolean with the same structure as the parameters.
1614
+ # The mask is True for parameters that should be decayed.
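+ # e.g. a path ending in ("self_attn_layer_norm", "scale") or any path ending in "bias" is
+ # masked out (False) and therefore excluded from weight decay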
1615
+ def decay_mask_fn(params):
1616
+ flat_params = traverse_util.flatten_dict(params)
1617
+ # find out all LayerNorm parameters
1618
+ layer_norm_candidates = [
1619
+ "layer_norm",
1620
+ "self_attn_layer_norm",
1621
+ "final_layer_norm",
1622
+ "encoder_attn_layer_norm",
1623
+ ]
1624
+ layer_norm_named_params = {
1625
+ layer[-2:]
1626
+ for layer_norm_name in layer_norm_candidates
1627
+ for layer in flat_params.keys()
1628
+ if layer_norm_name in "".join(layer).lower()
1629
+ }
1630
+ flat_mask = {path: path[-1] != "bias" and path[-2:] not in layer_norm_named_params for path in flat_params}
1631
+ return traverse_util.unflatten_dict(flat_mask)
1632
+
1633
+ # create adam optimizer
1634
+ adamw = optax.adamw(
1635
+ learning_rate=linear_decay_lr_schedule_fn,
1636
+ b1=training_args.adam_beta1,
1637
+ b2=training_args.adam_beta2,
1638
+ eps=training_args.adam_epsilon,
1639
+ weight_decay=training_args.weight_decay,
1640
+ mask=decay_mask_fn,
1641
+ )
1642
+
1643
+ if gradient_accumulation_steps > 1:
1644
+ # accumulate gradients and apply once every k steps
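+ # optax.MultiSteps accumulates the mini-batch gradients and only applies the inner adamw update
+ # every `gradient_accumulation_steps` calls; in between it emits zero updates, leaving the params unchanged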
1645
+ adamw = optax.MultiSteps(adamw, every_k_schedule=gradient_accumulation_steps)
1646
+
1647
+ share_hidden_states = training_args.freeze_encoder and student_model.config.d_model == teacher_model.config.d_model
1648
+ encoder_layer_mapping = get_layers_to_supervise(
1649
+ student_model.config.encoder_layers, teacher_model.config.encoder_layers
1650
+ )
1651
+ decoder_layer_mapping = get_layers_to_supervise(
1652
+ student_model.config.decoder_layers, teacher_model.config.decoder_layers
1653
+ )
1654
+
1655
+ # Setup train state
1656
+ student_state = TrainState.create(
1657
+ apply_fn=student_model.decode if share_hidden_states else student_model.__call__,
1658
+ params=student_params,
1659
+ tx=adamw,
1660
+ to_dtype=to_dtype,
1661
+ dropout_rng=dropout_rng,
1662
+ max_grad_norm=training_args.max_grad_norm,
1663
+ )
1664
+
1665
+ if training_args.resume_from_checkpoint is not None:
1666
+ if os.path.isfile(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")):
1667
+ logger.info(
1668
+ f"Checkpoint detected, resuming training at {training_args.resume_from_checkpoint}. To avoid "
1669
+ "this behavior, omit the resume_from_checkpoint argument."
1670
+ )
1671
+ with Path(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")).open("rb") as f:
1672
+ student_state = from_bytes(student_state, f.read())
1673
+ else:
1674
+ logger.warning(
1675
+ f"Checkpoint {training_args.resume_from_checkpoint} not detected, training from scratch. Ensure "
1676
+ f"you pass the path to a folder with a valid checkpoint for your model."
1677
+ )
1678
+
1679
+ def cross_entropy_loss(logits, labels):
1680
+ vocab_size = logits.shape[-1]
1681
+ # flax's onehot always returns a float32 device array, so we need to downcast if performing mixed-precision training
1682
+ onehot_targets = to_dtype(onehot(labels, vocab_size))
1683
+ loss = optax.softmax_cross_entropy(logits, onehot_targets)
1684
+ # ignore padded tokens in the loss, i.e. keep only positions where labels are not set to -100
1685
+ padding = labels >= 0
1686
+ loss = loss * padding
1687
+ loss = loss.sum()
1688
+ num_labels = padding.sum()
1689
+ return loss, num_labels
1690
+
1691
+ # temperature smoothed kl-divergence
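+ # KL(teacher || student) = sum_i p_teacher_i * (log p_teacher_i - log q_student_i), summed over the
+ # vocabulary and over all non-padded positions (labels >= 0); eps guards against log(0)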
1692
+ def kl_divergence(target_distribution, log_predicted_distribution, labels, eps=1e-20):
1693
+ divergence = -target_distribution * (log_predicted_distribution - jnp.log(target_distribution + eps))
1694
+ # ignore padded tokens in the divergence, i.e. keep only positions where labels are not set to -100
1695
+ padding_mask = labels >= 0
1696
+ padding_mask = jnp.expand_dims(padding_mask, axis=-1)
1697
+ divergence = (divergence * padding_mask).sum()
1698
+ return to_dtype(divergence) # respect the dtype of the backprop
1699
+
1700
+ def mean_square_error_loss(student_outputs, teacher_outputs):
1701
+ mse = dtype(0.0)
1702
+
1703
+ # tie encoder embeddings
1704
+ mse += jnp.mean(
1705
+ jnp.square(teacher_outputs.encoder_hidden_states[0] - student_outputs.encoder_hidden_states[0])
1706
+ )
1707
+
1708
+ for student_layer_id, teacher_layer_id in encoder_layer_mapping.items():
1709
+ # offset the hidden-state layer ids by 1 to account for the extra embedding hidden-state
1710
+ student_hidden_state = student_outputs.encoder_hidden_states[student_layer_id + 1]
1711
+ teacher_hidden_state = teacher_outputs.encoder_hidden_states[teacher_layer_id + 1]
1712
+ mse += jnp.mean(jnp.square(teacher_hidden_state - student_hidden_state))
1713
+
1714
+ # student_attention = student_outputs.encoder_attentions[student_layer_id]
1715
+ # teacher_attention = teacher_outputs.encoder_attentions[teacher_layer_id]
1716
+ # mse += jnp.mean(jnp.square(student_attention - teacher_attention))
1717
+
1718
+ # tie decoder embeddings
1719
+ mse += jnp.mean(
1720
+ jnp.square(teacher_outputs.decoder_hidden_states[0] - student_outputs.decoder_hidden_states[0])
1721
+ )
1722
+
1723
+ for student_layer_id, teacher_layer_id in decoder_layer_mapping.items():
1724
+ # offset the hidden-state layer ids by 1 to account for the extra embedding hidden-state
1725
+ student_hidden_state = student_outputs.decoder_hidden_states[student_layer_id + 1]
1726
+ teacher_hidden_state = teacher_outputs.decoder_hidden_states[teacher_layer_id + 1]
1727
+ mse += jnp.mean(jnp.square(teacher_hidden_state - student_hidden_state))
1728
+
1729
+ # student_attention = student_outputs.decoder_attentions[student_layer_id]
1730
+ # teacher_attention = teacher_outputs.decoder_attentions[teacher_layer_id]
1731
+ # mse += jnp.mean(jnp.square(student_attention - teacher_attention))
1732
+
1733
+ # student_cross_attention = student_outputs.cross_attentions[student_layer_id]
1734
+ # teacher_cross_attention = teacher_outputs.cross_attentions[teacher_layer_id]
1735
+ # mse += jnp.mean(jnp.square(student_cross_attention - teacher_cross_attention))
1736
+
1737
+ return to_dtype(mse) # respect the dtype of the backprop
1738
+
1739
+ # Define gradient update step fn
1740
+ def train_step(
1741
+ student_state,
1742
+ teacher_params,
1743
+ batch,
1744
+ freeze_encoder,
1745
+ share_hidden_states,
1746
+ temperature=2.0,
1747
+ ):
1748
+ dropout_rng, new_dropout_rng = jax.random.split(student_state.dropout_rng)
1749
+
1750
+ def compute_loss(student_params):
1751
+ labels = batch.pop("labels")
1752
+ output_hidden_states = not share_hidden_states and training_args.mse_weight > 0.0
1753
+
1754
+ teacher_outputs = teacher_model(
1755
+ **batch,
1756
+ params=teacher_params,
1757
+ freeze_encoder=True,
1758
+ output_hidden_states=output_hidden_states,
1759
+ train=False,
1760
+ )
1761
+
1762
+ if share_hidden_states:
1763
+ # if the student and teacher share the same frozen encoder then we don't have to recompute the
1764
+ # encoder hidden-states for the student model, we can just re-use from the teacher
1765
+ encoder_hidden_states = jax.lax.stop_gradient(teacher_outputs.encoder_last_hidden_state)
1766
+ encoder_outputs = FlaxBaseModelOutput(last_hidden_state=encoder_hidden_states)
1767
+
1768
+ student_outputs = student_state.apply_fn(
1769
+ decoder_input_ids=batch["decoder_input_ids"],
1770
+ encoder_outputs=encoder_outputs,
1771
+ params=student_params,
1772
+ dropout_rng=dropout_rng,
1773
+ train=True,
1774
+ )
1775
+ else:
1776
+ # do the full forward pass for the student model (encoder + decoder)
1777
+ student_outputs = student_state.apply_fn(
1778
+ **batch,
1779
+ params=student_params,
1780
+ dropout_rng=dropout_rng,
1781
+ freeze_encoder=freeze_encoder,
1782
+ output_hidden_states=output_hidden_states,
1783
+ train=True,
1784
+ )
1785
+
1786
+ # CE (data) loss
1787
+ ce_loss, num_labels = cross_entropy_loss(student_outputs.logits, labels)
1788
+
1789
+ # rescale by temperature to ensure gradients scale correctly
1790
+ teacher_distribution = jax.nn.softmax(teacher_outputs.logits / temperature, axis=-1)
1791
+ # ensure no information flow backwards through teacher
1792
+ teacher_distribution = jax.lax.stop_gradient(teacher_distribution)
1793
+ # log softmax of student predictions for numerical stability
1794
+ student_distribution = jax.nn.log_softmax(student_outputs.logits / temperature, axis=-1)
1795
+ # KL-divergence loss (scaled by temperature)
1796
+ kl_loss = kl_divergence(teacher_distribution, student_distribution, labels) * temperature**2
1797
+
1798
+ # MSE loss between enc-dec hidden-states and attentions
1799
+ mse_loss = (
1800
+ mean_square_error_loss(student_outputs, teacher_outputs)
1801
+ if output_hidden_states
1802
+ else jnp.zeros_like(kl_loss)
1803
+ )
1804
+
1805
+ # use DistilBart formulation - only tune the MSE weight and take remaining HPs from DistilBERT
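+ # i.e. loss = 0.8 * CE + kl_weight * KL + mse_weight * MSE when distilling (kl_weight > 0),
+ # otherwise plain CE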
1806
+ ce_weight = 0.8 if training_args.kl_weight > 0 else 1.0
1807
+ loss = ce_weight * ce_loss + training_args.kl_weight * kl_loss + training_args.mse_weight * mse_loss
1808
+
1809
+ return loss, (
1810
+ ce_loss,
1811
+ kl_loss,
1812
+ mse_loss,
1813
+ num_labels,
1814
+ )
1815
+
1816
+ grad_fn = jax.value_and_grad(compute_loss, has_aux=True)
1817
+ (loss, (ce_loss, kl_loss, mse_loss, num_labels)), grad = grad_fn(to_dtype(student_state.params))
1818
+
1819
+ # true loss = total loss / total samples
1820
+ loss = jax.lax.psum(loss, "batch")
1821
+ num_labels = jax.lax.psum(num_labels, "batch")
1822
+ loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
1823
+
1824
+ # true grad = total grad / total samples
1825
+ grad = jax.lax.psum(grad, "batch")
1826
+ grad = jax.tree_util.tree_map(lambda x: x / num_labels, grad)
1827
+ new_state = student_state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng, to_dtype=to_dtype)
1828
+
1829
+ # CE/KL/MSE losses for logging
1830
+ ce_loss = jax.lax.psum(ce_loss, "batch")
1831
+ ce_loss = jax.tree_util.tree_map(lambda x: x / num_labels, ce_loss)
1832
+
1833
+ kl_loss = jax.lax.psum(kl_loss, "batch")
1834
+ kl_loss = jax.tree_util.tree_map(lambda x: x / num_labels, kl_loss)
1835
+
1836
+ mse_loss = jax.lax.psum(mse_loss, "batch")
1837
+ mse_loss = jax.tree_util.tree_map(lambda x: x / num_labels, mse_loss)
1838
+
1839
+ metrics = {
1840
+ "loss": loss,
1841
+ "learning_rate": linear_decay_lr_schedule_fn(student_state.step),
1842
+ "ce_loss": ce_loss,
1843
+ "kl_loss": kl_loss,
1844
+ "mse_loss": mse_loss,
1845
+ }
1846
+ return new_state, metrics
1847
+
1848
+ # Define eval fn
1849
+ def eval_step(student_params, teacher_params, batch):
1850
+ labels = batch.pop("labels")
1851
+ output_hidden_states = not share_hidden_states and training_args.mse_weight > 0
1852
+
1853
+ student_outputs = student_model(
1854
+ **batch,
1855
+ params=student_params,
1856
+ output_hidden_states=output_hidden_states,
1857
+ train=False,
1858
+ )
1859
+ student_distribution = jax.nn.log_softmax(student_outputs.logits, axis=-1)
1860
+ ce_loss, num_labels = cross_entropy_loss(student_outputs.logits, labels)
1861
+
1862
+ teacher_outputs = teacher_model(
1863
+ **batch,
1864
+ params=teacher_params,
1865
+ output_hidden_states=output_hidden_states,
1866
+ train=False,
1867
+ )
1868
+ teacher_distribution = jax.nn.softmax(teacher_outputs.logits, axis=-1)
1869
+ # temperature is always 1 for eval
1870
+ kl_loss = kl_divergence(teacher_distribution, student_distribution, labels)
1871
+
1872
+ mse_loss = (
1873
+ mean_square_error_loss(student_outputs, teacher_outputs)
1874
+ if output_hidden_states
1875
+ else jnp.zeros_like(kl_loss)
1876
+ )
1877
+
1878
+ ce_weight = 0.8 if training_args.kl_weight > 0 else 1.0
1879
+ loss = ce_weight * ce_loss + training_args.kl_weight * kl_loss + training_args.mse_weight * mse_loss
1880
+ # true loss = total loss / total samples
1881
+ loss = jax.lax.psum(loss, "batch")
1882
+ num_labels = jax.lax.psum(num_labels, "batch")
1883
+ loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
1884
+
1885
+ # CE/KL/MSE losses for logging
1886
+ ce_loss = jax.lax.psum(ce_loss, "batch")
1887
+ ce_loss = jax.tree_util.tree_map(lambda x: x / num_labels, ce_loss)
1888
+
1889
+ kl_loss = jax.lax.psum(kl_loss, "batch")
1890
+ kl_loss = jax.tree_util.tree_map(lambda x: x / num_labels, kl_loss)
1891
+
1892
+ mse_loss = jax.lax.psum(mse_loss, "batch")
1893
+ mse_loss = jax.tree_util.tree_map(lambda x: x / num_labels, mse_loss)
1894
+
1895
+ metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss, "mse_loss": mse_loss}
1896
+ return metrics
1897
+
1898
+ # Define generation function
1899
+ num_beams = (
1900
+ training_args.generation_num_beams
1901
+ if training_args.generation_num_beams is not None
1902
+ else student_model.config.num_beams
1903
+ )
1904
+
1905
+ # forcing the language and task tokens helps the model in its generations
1906
+ gen_kwargs = {
1907
+ "max_length": max_label_length,
1908
+ "num_beams": num_beams,
1909
+ "language": "<|en|>",
1910
+ "task": "transcribe",
1911
+ "return_timestamps": return_timestamps,
1912
+ }
1913
+
1914
+ def generate_step(student_params, batch):
1915
+ output_ids = student_model.generate(
1916
+ batch[model_input_name],
1917
+ attention_mask=batch.get("attention_mask"),
1918
+ params=student_params,
1919
+ **gen_kwargs,
1920
+ )
1921
+ return output_ids.sequences
1922
+
1923
+ # Replicate the train state on each device
1924
+ student_state = student_state.replicate()
1925
+
1926
+ # Replicate the teacher params on each device
1927
+ teacher_params = jax_utils.replicate(teacher_params)
1928
+
1929
+ # Create parallel version of the train and eval step
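+ # freeze_encoder and share_hidden_states (argnums 3 and 4) are Python bools, so they are
+ # broadcast statically; temperature is broadcast via in_axes=None rather than sharded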
1930
+ p_train_step = jax.pmap(
1931
+ train_step,
1932
+ "batch",
1933
+ in_axes=(0, 0, 0, None, None, None),
1934
+ donate_argnums=(0,),
1935
+ static_broadcasted_argnums=(
1936
+ 3,
1937
+ 4,
1938
+ ),
1939
+ )
1940
+ p_eval_step = jax.pmap(eval_step, "batch")
1941
+ p_generate_step = jax.pmap(generate_step, "batch")
1942
+
1943
+ logger.info("***** Running training *****")
1944
+ logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
1945
+ logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1946
+ logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1947
+ logger.info(
1948
+ f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1949
+ )
1950
+ logger.info(f" Total optimization steps = {total_train_steps}")
1951
+
1952
+ # ======================== Training ================================
1953
+ train_time = 0
1954
+ train_start = time.time()
1955
+ train_metrics = []
1956
+ batches_to_skip = jax.device_get(unreplicate(student_state.step))
1957
+ cur_step = int(batches_to_skip) # will be zero if starting from scratch
1958
+ epochs_trained = batches_to_skip // steps_per_epoch
1959
+ steps_trained_progress_bar = tqdm(range(total_train_steps), desc="Train steps ... ", position=0)
1960
+ steps_trained_progress_bar.update(batches_to_skip)
1961
+ continue_training = True
1962
+ minibatch_steps = 0
1963
+
1964
+ if batches_to_skip > 0:
1965
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1966
+ logger.info(f" Continuing training from epoch {epochs_trained}")
1967
+ logger.info(f" Continuing training from global step {batches_to_skip}")
1968
+
1969
+ # Generate a training data loader by shuffling sampling indices from the train dataset
1970
+ train_loader = get_data_loader(
1971
+ training_args.seed,
1972
+ vectorized_datasets["train"],
1973
+ batch_size=train_batch_size,
1974
+ data_collator=data_collator,
1975
+ dataloader_num_workers=dataloader_num_workers,
1976
+ skip_batches=batches_to_skip,
1977
+ prefetch_size=dataloader_prefetch_size,
1978
+ )
1979
+
1980
+ for epoch in range(epochs_trained, num_epochs):
1981
+ if hasattr(train_loader, "dataset") and isinstance(train_loader.dataset, IterableDataset):
1982
+ train_loader.dataset.set_epoch(epoch)
1983
+
1984
+ for batch in train_loader:
1985
+ minibatch_steps += 1
1986
+ update_step = minibatch_steps == gradient_accumulation_steps
1987
+
1988
+ if update_step:
1989
+ steps_trained_progress_bar.update(1)
1990
+ cur_step += 1
1991
+ minibatch_steps = 0
1992
+
1993
+ batch = shard(batch.data)
1994
+ student_state, train_metric = p_train_step(
1995
+ student_state,
1996
+ teacher_params,
1997
+ batch,
1998
+ training_args.freeze_encoder,
1999
+ share_hidden_states,
2000
+ training_args.temperature,
2001
+ )
2002
+
2003
+ if cur_step % training_args.logging_steps == 0 and update_step:
2004
+ train_metrics.append(train_metric)
2005
+ train_metric_to_write = unreplicate(train_metric)
2006
+ steps_trained_progress_bar.write(
2007
+ f"Step... ({cur_step} / {total_train_steps} | Loss:"
2008
+ f" {train_metric_to_write['loss']}, Learning Rate:"
2009
+ f" {train_metric_to_write['learning_rate']})"
2010
+ )
2011
+ if has_wandb and jax.process_index() == 0:
2012
+ write_wandb_metric(
2013
+ wandb_logger,
2014
+ train_metric_to_write,
2015
+ train_time + time.time() - train_start,
2016
+ cur_step,
2017
+ epoch,
2018
+ prefix="train",
2019
+ )
2020
+
2021
+ # save checkpoint and weights after each save_steps and at the end of training
2022
+ if (cur_step % training_args.save_steps == 0 and update_step) or cur_step == total_train_steps:
2023
+ if jax.process_index() == 0:
2024
+ save_hf_weights(
2025
+ student_state,
2026
+ student_model,
2027
+ processor,
2028
+ training_args.output_dir,
2029
+ cur_step,
2030
+ total_train_steps,
2031
+ use_scan=training_args.use_scan,
2032
+ )
2033
+ if training_args.save_train_state:
2034
+ student_state.save_state(
2035
+ training_args.output_dir, save_total_limit=training_args.save_total_limit
2036
+ )
2037
+ if training_args.push_to_hub:
2038
+ repo.push_to_hub(
2039
+ commit_message=f"Saving train state of step {cur_step}",
2040
+ blocking=False,
2041
+ )
2042
+
2043
+ if training_args.do_eval and (
2044
+ (cur_step % eval_steps == 0 and update_step) or cur_step == total_train_steps
2045
+ ):
2046
+ train_time += time.time() - train_start
2047
+ # ======================== Evaluating ==============================
2048
+ for eval_split in all_eval_splits:
2049
+ eval_metrics = []
2050
+ eval_preds = []
2051
+ eval_labels = []
2052
+ eval_start = time.time()
2053
+
2054
+ eval_loader = get_data_loader(
2055
+ training_args.seed,
2056
+ vectorized_datasets[eval_split],
2057
+ batch_size=eval_batch_size,
2058
+ data_collator=data_collator,
2059
+ shuffle=False,
2060
+ drop_last=False,
2061
+ dataloader_num_workers=dataloader_num_workers,
2062
+ )
2063
+ for batch in tqdm(eval_loader, desc=f"Evaluating {eval_split}...", position=2):
2064
+ # Model forward
2065
+ labels = batch["labels"]
2066
+
2067
+ metrics = pad_shard_unpad(
2068
+ p_eval_step,
2069
+ static_argnums=(
2070
+ 0,
2071
+ 1,
2072
+ ),
2073
+ static_return=True,
2074
+ )(
2075
+ student_state.params,
2076
+ teacher_params,
2077
+ batch.data,
2078
+ min_device_batch=per_device_eval_batch_size,
2079
+ )
2080
+ eval_metrics.append(metrics)
2081
+
2082
+ # generation
2083
+ if training_args.predict_with_generate:
2084
+ generated_ids = pad_shard_unpad(p_generate_step)(
2085
+ student_state.params, batch.data, min_device_batch=per_device_eval_batch_size
2086
+ )
2087
+ eval_preds.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
2088
+ eval_labels.extend(labels)
2089
+
2090
+ eval_time = time.time() - eval_start
2091
+
2092
+ # normalize eval metrics
2093
+ eval_metrics = get_metrics(eval_metrics)
2094
+ eval_metrics = jax.tree_util.tree_map(jnp.mean, eval_metrics)
2095
+
2096
+ # compute WER metric
2097
+ wer_desc = ""
2098
+ if training_args.predict_with_generate:
2099
+ wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
2100
+ eval_preds, eval_labels
2101
+ )
2102
+ eval_metrics.update(wer_metric)
2103
+ wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
2104
+
2105
+ # Print metrics and update progress bar
2106
+ steps_trained_progress_bar.write(
2107
+ f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
2108
+ f" {wer_desc})"
2109
+ )
2110
+
2111
+ if has_tensorboard and jax.process_index() == 0:
2112
+ write_eval_metric(
2113
+ summary_writer,
2114
+ eval_metrics,
2115
+ cur_step,
2116
+ prefix=eval_split,
2117
+ )
2118
+
2119
+ if has_wandb and jax.process_index() == 0:
2120
+ write_wandb_metric(wandb_logger, eval_metrics, eval_time, cur_step, epoch, prefix=eval_split)
2121
+ if training_args.predict_with_generate:
2122
+ write_wandb_pred(
2123
+ wandb_logger,
2124
+ pred_str,
2125
+ label_str,
2126
+ norm_pred_str,
2127
+ norm_label_str,
2128
+ cur_step,
2129
+ prefix=eval_split,
2130
+ )
2131
+
2132
+ if has_tensorboard and jax.process_index() == 0:
2133
+ # we'll only log to tensorboard every eval steps
2134
+ write_train_metric(
2135
+ summary_writer,
2136
+ train_metrics,
2137
+ train_time,
2138
+ cur_step,
2139
+ training_args.logging_steps,
2140
+ )
2141
+
2142
+ # flush the train metrics
2143
+ train_start = time.time()
2144
+ train_metrics = []
2145
+
2146
+ # break condition
2147
+ if cur_step == total_train_steps:
2148
+ continue_training = False
2149
+ break
2150
+
2151
+ if not continue_training:
2152
+ break
2153
+
2154
+
2155
+ if __name__ == "__main__":
2156
+ main()
run_distillation_debug.py ADDED
@@ -0,0 +1,2162 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Training the Whisper model for sequence to sequence speech recognition via teacher-student distillation.
18
+ """
19
+ # You can also adapt this script for your own distillation tasks. Pointers for this are left as comments.
20
+
21
+ import logging
22
+ import os
23
+ import re
24
+ import shutil
25
+ import string
26
+ import sys
27
+ import time
28
+ from dataclasses import dataclass, field
29
+ from functools import partial
30
+ from pathlib import Path
31
+ from typing import Any, Callable, Dict, List, Optional, Union
32
+
33
+ import datasets
34
+ import evaluate
35
+ import flax
36
+ import jax
37
+ import jax.numpy as jnp
38
+ import numpy as np
39
+ import optax
40
+ import torch
41
+ import transformers
42
+ from datasets import (
43
+ DatasetDict,
44
+ IterableDataset,
45
+ IterableDatasetDict,
46
+ concatenate_datasets,
47
+ interleave_datasets,
48
+ load_dataset,
49
+ )
50
+ from flax import jax_utils, traverse_util
51
+ from flax.jax_utils import pad_shard_unpad, unreplicate
52
+ from flax.serialization import from_bytes, to_bytes
53
+ from flax.training import train_state
54
+ from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
55
+ from huggingface_hub import Repository, create_repo
56
+ from jax.experimental.compilation_cache import compilation_cache as cc
57
+ from optax._src import linear_algebra
58
+ from torch.utils.data import DataLoader
59
+ from torchdata.datapipes.iter import IterableWrapper
60
+ from tqdm import tqdm
61
+ from transformers import (
62
+ AddedToken,
63
+ HfArgumentParser,
64
+ Seq2SeqTrainingArguments,
65
+ WhisperConfig,
66
+ WhisperFeatureExtractor,
67
+ WhisperProcessor,
68
+ WhisperTokenizerFast,
69
+ is_tensorboard_available,
70
+ is_wandb_available,
71
+ set_seed,
72
+ )
73
+ from transformers.file_utils import get_full_repo_name
74
+ from transformers.modeling_flax_outputs import FlaxBaseModelOutput
75
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer, EnglishTextNormalizer
76
+ from transformers.utils import check_min_version, send_example_telemetry
77
+ from transformers.utils.versions import require_version
78
+
79
+ from distil_whisper import FlaxWhisperForConditionalGeneration
80
+
81
+
82
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
83
+ check_min_version("4.27.0.dev0")
84
+
85
+ require_version(
86
+ "datasets>=1.18.0",
87
+ "To fix: pip install -r examples/flax/speech-recogintion/requirements.txt",
88
+ )
89
+
90
+ logger = logging.getLogger(__name__)
91
+
92
+
93
+ @flax.struct.dataclass
94
+ class ModelArguments:
95
+ """
96
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
97
+ """
98
+
99
+ model_name_or_path: str = field(
100
+ metadata={"help": ("Path to pretrained student model or model identifier from huggingface.co/models")}
101
+ )
102
+ teacher_model_name_or_path: str = field(
103
+ metadata={"help": ("Path to pretrained teacher model or model identifier from huggingface.co/models")}
104
+ )
105
+ config_name: Optional[str] = field(
106
+ default=None,
107
+ metadata={"help": "Pretrained config name or path if not the same as model_name"},
108
+ )
109
+ tokenizer_name: Optional[str] = field(
110
+ default=None,
111
+ metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
112
+ )
113
+ feature_extractor_name: Optional[str] = field(
114
+ default=None,
115
+ metadata={"help": "feature extractor name or path if not the same as model_name"},
116
+ )
117
+ cache_dir: Optional[str] = field(
118
+ default=None,
119
+ metadata={"help": ("Where to store the pretrained models downloaded from huggingface.co")},
120
+ )
121
+ use_fast_tokenizer: bool = field(
122
+ default=True,
123
+ metadata={"help": ("Whether to use one of the fast tokenizer (backed by the tokenizers library) or not.")},
124
+ )
125
+ model_revision: str = field(
126
+ default="main",
127
+ metadata={"help": ("The specific model version to use (can be a branch name, tag name or commit id).")},
128
+ )
129
+ subfolder: str = field(
130
+ default="",
131
+ metadata={
132
+ "help": "In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can"
133
+ "specify the folder name here."
134
+ },
135
+ )
136
+ use_auth_token: bool = field(
137
+ default=False,
138
+ metadata={
139
+ "help": (
140
+ "Will use the token generated when running `transformers-cli login`"
141
+ " (necessary to use this script with private models)."
142
+ )
143
+ },
144
+ )
145
+ dtype: Optional[str] = field(
146
+ default="float32",
147
+ metadata={
148
+ "help": (
149
+ "Floating-point format in which the model weights should be initialized"
150
+ " and trained. Choose one of `[float32, float16, bfloat16]`."
151
+ )
152
+ },
153
+ )
154
+ load_with_scan_weights: bool = field(
155
+ default=False,
156
+ metadata={
157
+ "help": "Whether the pre-trained checkpoint has its weights stored in scan format. Set to True for scanned "
158
+ "weights, defaults to False for non-scan (unrolled) weights."
159
+ },
160
+ )
161
+ activation_dropout: float = field(
162
+ default=0.0,
163
+ metadata={"help": "The dropout ratio for activations inside the fully connected layer."},
164
+ )
165
+ attention_dropout: float = field(
166
+ default=0.0,
167
+ metadata={"help": "The dropout ratio for the attention probabilities."},
168
+ )
169
+ dropout: float = field(
170
+ default=0.0,
171
+ metadata={
172
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
173
+ },
174
+ )
175
+
176
+
177
+ @flax.struct.dataclass
178
+ class DataTrainingArguments:
179
+ """
180
+ Arguments pertaining to what data we are going to input our model for training and eval.
181
+ """
182
+
183
+ train_dataset_name: str = field(
184
+ default=None,
185
+ metadata={
186
+ "help": "The name of the training dataset to use (via the datasets library). Load and combine "
187
+ "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
188
+ " librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
189
+ },
190
+ )
191
+ train_dataset_config_name: Optional[str] = field(
192
+ default=None,
193
+ metadata={
194
+ "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
195
+ "multiple datasets by separating dataset configs by a '+' symbol."
196
+ },
197
+ )
198
+ train_dataset_samples: str = field(
199
+ default=None,
200
+ metadata={
201
+ "help": "Number of samples in the training data. Load and combine "
202
+ "multiple datasets by separating dataset samples by a '+' symbol."
203
+ },
204
+ )
205
+ eval_dataset_name: str = field(
206
+ default=None,
207
+ metadata={
208
+ "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
209
+ },
210
+ )
211
+ eval_dataset_config_name: Optional[str] = field(
212
+ default=None,
213
+ metadata={
214
+ "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
215
+ },
216
+ )
217
+ dataset_cache_dir: Optional[str] = field(
218
+ default=None,
219
+ metadata={"help": "Path to cache directory for saving and loading datasets"},
220
+ )
221
+ overwrite_cache: bool = field(
222
+ default=False,
223
+ metadata={"help": "Overwrite the cached training and evaluation sets"},
224
+ )
225
+ preprocessing_num_workers: Optional[int] = field(
226
+ default=None,
227
+ metadata={"help": "The number of processes to use for the preprocessing."},
228
+ )
229
+ max_train_samples: Optional[int] = field(
230
+ default=None,
231
+ metadata={
232
+ "help": (
233
+ "For debugging purposes or quicker training, truncate the number of"
234
+ " training examples to this value if set."
235
+ )
236
+ },
237
+ )
238
+ max_eval_samples: Optional[int] = field(
239
+ default=None,
240
+ metadata={
241
+ "help": (
242
+ "For debugging purposes or quicker training, truncate the number of"
243
+ " evaluation examples to this value if set."
244
+ )
245
+ },
246
+ )
247
+ audio_column_name: str = field(
248
+ default="audio",
249
+ metadata={"help": ("The name of the dataset column containing the audio data. Defaults to 'audio'")},
250
+ )
251
+ train_text_column_name: str = field(
252
+ default="whisper_transcript",
253
+ metadata={
254
+ "help": (
255
+ "The name of the dataset column containing the text data. Defaults to"
256
+ " 'whisper_transcript'which is the pseudo-labelled Whisper"
257
+ " transcription data."
258
+ )
259
+ },
260
+ )
261
+ eval_text_column_name: str = field(
262
+ default="text",
263
+ metadata={
264
+ "help": (
265
+ "The name of the dataset column containing the text data. Defaults to"
266
+ " 'text', which is the original text data"
267
+ )
268
+ },
269
+ )
270
+ max_duration_in_seconds: float = field(
271
+ default=30.0,
272
+ metadata={"help": ("Filter audio files that are longer than `max_duration_in_seconds` seconds")},
273
+ )
274
+ min_duration_in_seconds: float = field(
275
+ default=0.0,
276
+ metadata={"help": ("Filter audio files that are shorter than `min_duration_in_seconds` seconds")},
277
+ )
278
+ max_label_length: int = field(
279
+ default=128,
280
+ metadata={"help": "Truncate transcriptions that are longer `max_label_length` tokens."},
281
+ )
282
+ pad_target_to_multiple_of: Optional[int] = field(
283
+ default=None,
284
+ metadata={
285
+ "help": (
286
+ "If set will pad the target sequence to a multiple of the provided"
287
+ " value. This is important to avoid triggering recompilations on TPU."
288
+ " If unspecified, will default to padding the targets to max length."
289
+ )
290
+ },
291
+ )
292
+ preprocessing_only: bool = field(
293
+ default=False,
294
+ metadata={
295
+ "help": (
296
+ "Whether to only do data preprocessing and skip training. This is"
297
+ " especially useful when data preprocessing errors out in distributed"
298
+ " training due to timeout. In this case, one should run the"
299
+ " preprocessing in a non-distributed setup with"
300
+ " `preprocessing_only=True` so that the cached datasets can"
301
+ " consequently be loaded in distributed training"
302
+ )
303
+ },
304
+ )
305
+ train_split_name: str = field(
306
+ default="train",
307
+ metadata={
308
+ "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
309
+ },
310
+ )
311
+ eval_split_name: str = field(
312
+ default="validation",
313
+ metadata={
314
+ "help": (
315
+ "The name of the evaluation data set split to use (via the datasets"
316
+ " library). Defaults to 'validation'"
317
+ )
318
+ },
319
+ )
320
+ wandb_project: str = field(
321
+ default="distil-whisper",
322
+ metadata={"help": "The name of the wandb project."},
323
+ )
324
+ wandb_name: str = field(
325
+ default=None,
326
+ metadata={"help": "The name of the wandb run."},
327
+ )
328
+ wandb_job_type: str = field(
329
+ default="distil-whisper",
330
+ metadata={"help": "The name of the wandb job type."},
331
+ )
332
+ wandb_dir: str = field(
333
+ default=None,
334
+ metadata={"help": "The absolute path to save the wandb logs."},
335
+ )
336
+ save_code_to_wandb: bool = field(
337
+ default=False,
338
+ metadata={
339
+ "help": (
340
+ "Whether to save main script to wandb. This is valuable for improving"
341
+ " experiment reproducibility and to diff code across experiments in"
342
+ " the UI."
343
+ )
344
+ },
345
+ )
346
+ streaming: bool = field(
347
+ default=True,
348
+ metadata={"help": "Whether to use Datasets' streaming mode to load and the data."},
349
+ )
350
+ wer_threshold: float = field(
351
+ default=None,
352
+ metadata={
353
+ "help": "Filter training data with Whisper transcriptions that have greater than `wer_threshold` "
354
+ "WER with the normalised transcriptions."
355
+ },
356
+ )
357
+ prefetch_size: int = field(
358
+ default=0,
359
+ metadata={"help": "Number of samples to pre-fetch if using an iterable dataset."},
360
+ )
361
+ timestamp_probability: float = field(
362
+ default=0.5, metadata={"help": "Probability for training on timestamped tokens if the data contains it."}
363
+ )
364
+ return_timestamps: bool = field(
365
+ default=False, metadata={"help": "Whether or not to predict timestamps in the generation step."}
366
+ )
367
+ round_timestamps: bool = field(
368
+ default=False,
369
+ metadata={
370
+ "help": "Whether or not to round the timestamp tokens to the nearest tenth of a second."
371
+ "By default, Whisper predicts timestamps to the nearest hundredth of a second."
372
+ "Reducing the timestamp precision to one tenth of a second simplifies the timestamp"
373
+ "prediction task, at the expense of timestamp granularity."
374
+ },
375
+ )
376
+
377
+
378
+ @dataclass
379
+ class FlaxSeq2SeqTrainingArguments(Seq2SeqTrainingArguments):
380
+ use_scan: Optional[bool] = field(
381
+ default=True,
382
+ metadata={
383
+ "help": (
384
+ "Whether or not to use `scan_with_axes` over the encoder and decoder blocks. Using scan results "
385
+ "in faster compile times and more efficient memory use during training, since all of the layers "
386
+ "in the encoder/decoder are stacked, and we perform a lax.scan over the stacked block to index "
387
+ "each layer. However, it results in slower inference time due to the overhead of stacking the "
388
+ "layers this way. Thus, we **always** default to disabling scan for the inference step."
389
+ )
390
+ },
391
+ )
392
+ freeze_encoder: Optional[bool] = field(
393
+ default=False,
394
+ metadata={
395
+ "help": (
396
+ "Whether to freeze the entire encoder model. Only recommended when the entire encoder has been "
397
+ "copied from the teacher model."
398
+ )
399
+ },
400
+ )
401
+ temperature: Optional[float] = field(
402
+ default=2.0, metadata={"help": "Temperature to anneal the logits when computing the softmax."}
403
+ )
404
+ kl_weight: Optional[float] = field(
405
+ default=1.0,
406
+ metadata={
407
+ "help": (
408
+ "Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
409
+ "computed between the teacher-student hidden states and attentions."
410
+ )
411
+ },
412
+ )
413
+ mse_weight: Optional[float] = field(
414
+ default=0.0,
415
+ metadata={
416
+ "help": (
417
+ "Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
418
+ "computed between the teacher-student hidden states and attentions."
419
+ )
420
+ },
421
+ )
422
+ precision: Optional[str] = field(
423
+ default="half_mixed",
424
+ metadata={
425
+ "help": (
426
+ "Precision with which run training, Can be one of `full`, `half_mixed` or `full_mixed`, the latter two"
427
+ "of which enable *mixed-precision* training. **Note that this only specifies the dtype of the computation "
428
+ "and optimizer state. It does not influence the dtype of model parameters.** An explanation of the three "
429
+ "settings is provided below:"
430
+ " 1. Full precision: forward pass, backward pass and optimiser states all in float32."
431
+ " 2. Half mixed precision: forward pass in bfloat16, backward pass and optimiser states in float32. This "
432
+ " corresponds to setting the dtype argument to bfloat16 when instantiating the model."
433
+ " 3. Full mixed precision: forward pass, backward pass and optimiser states all in bfloat16. The dtype "
434
+ " argument is set to bfloat16 for the forward pass, and the gradients computed with respect to the bfloat16 "
435
+ " parameters in the backward pass (giving bfloat16 gradients). The new optimiser states and parameter "
436
+ " updates are computed in float32 by upcasting the bfloat16 gradients and optimiser states to float32 "
437
+ " prior to the optimiser update step. The optimiser states are returned in float32 (but not saved to "
438
+ " memory) and then downcasted to bfloat16 (saved to memory) for the subsequent train step."
439
+ "For further details, refer to https://github.com/deepmind/optax/discussions/336"
440
+ )
441
+ },
442
+ )
443
+ compilation_cache: Optional[bool] = field(
444
+ default=False,
445
+ metadata={
446
+ "help": (
447
+ "Whether to enable the JAX (experimental) compilation cache. The compilation step is *cached* the "
448
+ "first time it is run. Successive compilation steps for the same function utilise the cache to reduce"
449
+ "the compilation time."
450
+ )
451
+ },
452
+ )
453
+ save_train_state: Optional[bool] = field(
454
+ default=False,
455
+ metadata={
456
+ "help": "Whether or not to save the Flax Train State on each `save_steps` steps. Required if you intend"
457
+ "to resume training from partial training runs. If False, only the model weights will be saved."
458
+ "If True, both the model weights and Flax Train state will be saved."
459
+ },
460
+ )
461
+
462
+
463
+ def shift_tokens_right(label_ids: np.array, decoder_start_token_id: int) -> np.ndarray:
464
+ """
465
+ Shift label ids one token to the right.
466
+ """
467
+ shifted_label_ids = np.zeros_like(label_ids)
468
+ shifted_label_ids[:, 1:] = label_ids[:, :-1]
469
+ shifted_label_ids[:, 0] = decoder_start_token_id
470
+
471
+ return shifted_label_ids
472
+
473
+
474
+ @flax.struct.dataclass
475
+ class FlaxDataCollatorSpeechSeq2SeqWithPadding:
476
+ """
477
+ Data collator that will dynamically pad the inputs received.
478
+ Args:
479
+ processor ([`Wav2Vec2Processor`])
480
+ The processor used for processing the data.
481
+ decoder_start_token_id (:obj: `int`)
482
+ The start-of-sequence token id of the decoder.
483
+ decoder_prev_token_id (:obj: `int`)
484
+ The start-of-prompt token id of the decoder
485
+ input_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
486
+ Select a strategy to pad the returned input sequences (according to the model's padding side and padding index)
487
+ among:
488
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
489
+ sequence is provided).
490
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
491
+ maximum acceptable input length for the model if that argument is not provided.
492
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
493
+ different lengths).
494
+ target_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
495
+ Select a strategy to pad the returned target sequences (according to the model's padding side and padding index).
496
+ See above for details.
497
+ max_target_length (:obj:`int`, `optional`):
498
+ Maximum length of the ``labels`` of the returned list and optionally padding length (see above).
499
+ """
500
+
501
+ processor: Any
502
+ decoder_start_token_id: int
503
+ decoder_prev_token_id: int
504
+ input_padding: Union[bool, str] = "max_length"
505
+ target_padding: Union[bool, str] = "max_length"
506
+ max_target_length: Optional[int] = None
507
+
508
+ def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
509
+ # split inputs and labels since they have to be of different lengths and need
510
+ # different padding methods
511
+ model_input_name = self.processor.model_input_names[0]
512
+
513
+ # dataloader returns a list of features which we convert to a dict
514
+ input_features = {model_input_name: [feature[model_input_name] for feature in features]}
515
+ label_features = {"input_ids": [feature["labels"] for feature in features]}
516
+
517
+ # reformat list to dict and set to numpy format
518
+ batch = self.processor.feature_extractor.pad(
519
+ input_features,
520
+ padding=self.input_padding,
521
+ return_tensors="np",
522
+ )
523
+
524
+ labels_batch = self.processor.tokenizer.pad(
525
+ label_features,
526
+ max_length=self.max_target_length,
527
+ padding=self.target_padding,
528
+ return_tensors="np",
529
+ )
530
+
531
+ # if bos token is appended in previous tokenization step,
532
+ # cut bos token here as it's appended later anyway
533
+ labels = labels_batch["input_ids"]
534
+ if set(np.unique(labels[:, 0])).issubset({self.decoder_start_token_id, self.decoder_prev_token_id}):
535
+ decoder_input_ids = labels[:, :-1]
536
+ labels = labels[:, 1:]
537
+ labels_batch.attention_mask = labels_batch.attention_mask[:, 1:]
538
+ else:
539
+ decoder_input_ids = shift_tokens_right(labels, self.decoder_start_token_id)
540
+
541
+ # replace padding with -100 to ignore correctly when computing the loss
542
+ labels = np.ma.array(labels, mask=np.not_equal(labels_batch.attention_mask, 1))
543
+ labels = labels.filled(fill_value=-100)
544
+
545
+ # replace initial prompt tokens with -100 to ignore correctly when computing the loss
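+ # np.argmax over the boolean mask returns the index of the first <|startoftranscript|> token in
+ # each row; every position before it (the <|startofprev|> prompt) is excluded from the loss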
546
+ bos_index = np.argmax(labels == self.decoder_start_token_id, axis=1)
547
+ prompt_mask = np.arange(labels.shape[1]) < bos_index[:, None]
548
+ labels = np.where(prompt_mask, -100, labels)
549
+
550
+ batch["labels"] = labels
551
+ batch["decoder_input_ids"] = decoder_input_ids
552
+
553
+ return batch
554
+
555
+
556
+ def get_data_loader(
557
+ seed: int,
558
+ dataset: IterableDataset,
559
+ batch_size: int,
560
+ data_collator: FlaxDataCollatorSpeechSeq2SeqWithPadding,
561
+ shuffle: bool = False,
562
+ drop_last: bool = True,
563
+ dataloader_num_workers: int = 0,
564
+ skip_batches: int = 0,
565
+ pin_memory: bool = True,
566
+ prefetch_size: int = 0,
567
+ ) -> DataLoader:
568
+ """
569
+ Returns batches of size `batch_size` from `dataset`. If `drop_last` is set to `False`, the final batch may be incomplete,
570
+ and range in size from 1 to `batch_size`. Shuffle batches if `shuffle` is `True`.
571
+
572
+ Args:
573
+ seed (int): Numpy seed for generating pseudo random numbers. Used if shuffling the dataset.
574
+ dataset (IterableDataset): streaming dataset from which to load the data.
575
+ batch_size (int): how many samples per batch to load.
576
+ data_collator (FlaxDataCollatorSpeechSeq2SeqWithPadding, optional): merges a list of samples to form a
577
+ mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
578
+ shuffle (bool, optional): set to `True` to have the batches reshuffled.
579
+ drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
580
+ if the dataset size is not divisible by the batch size. If ``False`` and
581
+ the size of dataset is not divisible by the batch size, then the last batch
582
+ will be smaller. (default: ``True``)
583
+ dataloader_num_workers (int, optional): how many subprocesses to use for data
584
+ loading. ``0`` means that the data will be loaded in the main process.
585
+ (default: ``0``)
586
+ skip_batches (int, optional): Efficiently skip the first `skip_batches` batches. (default: ``0``)
587
+ pin_memory (bool, optional): If ``True``, the data loader will copy Tensors
588
+ into device/CUDA pinned memory before returning them. If your data elements
589
+ are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type,
590
+ the default memory pinning logic may not recognise them and they will be returned without pinning.
591
+
592
+ """
593
+ if shuffle:
594
+ dataset = dataset.shuffle(seed)
595
+
596
+ if skip_batches > 0:
597
+ dataset = dataset.skip(skip_batches * batch_size)
598
+
599
+ if prefetch_size > 0:
600
+ dataset = IterableWrapper(dataset)
601
+ dataset = dataset.prefetch(prefetch_size)
602
+
603
+ data_loader = DataLoader(
604
+ dataset,
605
+ batch_size=batch_size,
606
+ drop_last=drop_last,
607
+ pin_memory=pin_memory,
608
+ collate_fn=data_collator,
609
+ num_workers=dataloader_num_workers,
610
+ )
611
+
612
+ return data_loader
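+ # Usage sketch with hypothetical arguments (not part of the training loop below): the loader yields the
+ # numpy dicts produced by the collator above, typically with keys "input_features", "labels" and
+ # "decoder_input_ids", e.g.
+ #   loader = get_data_loader(seed=42, dataset=vectorized_datasets["train"], batch_size=64,
+ #                            data_collator=data_collator, shuffle=True)
+ #   batch = next(iter(loader))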
613
+
614
+
615
+ def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
616
+ ordering_and_checkpoint_path = []
617
+
618
+ glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]
619
+
620
+ for path in glob_checkpoints:
621
+ if use_mtime:
622
+ ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
623
+ else:
624
+ regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
625
+ if regex_match is not None and regex_match.groups() is not None:
626
+ ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
627
+
628
+ checkpoints_sorted = sorted(ordering_and_checkpoint_path)
629
+ checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
630
+ return checkpoints_sorted
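+ # Illustration with hypothetical directory names: for ["checkpoint-500", "checkpoint-2000", "checkpoint-10000"]
+ # the numeric sort above returns them oldest-first as [checkpoint-500, checkpoint-2000, checkpoint-10000],
+ # whereas a plain lexicographic sort would incorrectly place checkpoint-10000 before checkpoint-2000.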
631
+
632
+
633
+ def rotate_checkpoints(
634
+ save_total_limit=None, use_mtime=False, output_dir=None, checkpoint_prefix="checkpoint"
635
+ ) -> None:
636
+ if save_total_limit is None or save_total_limit <= 0:
637
+ return
638
+
639
+ # Check if we should delete older checkpoint(s)
640
+ checkpoints_sorted = sorted_checkpoints(
641
+ use_mtime=use_mtime, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
642
+ )
643
+ if len(checkpoints_sorted) <= save_total_limit:
644
+ return
645
+
646
+ number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
647
+ checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
648
+ for checkpoint in checkpoints_to_be_deleted:
649
+ logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
650
+ shutil.rmtree(checkpoint, ignore_errors=True)
651
+
652
+
653
+ def to_fp32(t):
654
+ return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
655
+
656
+
657
+ def to_bf16(t):
658
+ return jax.tree_map(lambda x: x.astype(jnp.bfloat16) if x.dtype == jnp.float32 else x, t)
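+ # Note: both casts act leaf-wise on a pytree and only touch floats of the named dtype, so e.g. to_bf16(params)
+ # downcasts every fp32 array to bf16 while leaving integer leaves (such as step counters) untouched, and
+ # to_fp32 is the inverse upcast used before the fp32 optimiser update below.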
659
+
660
+
661
+ class TrainState(train_state.TrainState):
662
+ dropout_rng: jnp.ndarray
663
+ max_grad_norm: float
664
+
665
+ def apply_gradients(self, *, grads, to_dtype: to_fp32, **kwargs):
666
+ """Updates `step`, `params`, `opt_state` and `**kwargs` in return value, clipping the
667
+ gradients by the maximum grad norm.
668
+
669
+ Note that internally this function calls `.tx.update()` followed by a call
670
+ to `optax.apply_updates()` to update `params` and `opt_state`.
671
+
672
+ Args:
673
+ grads: Gradients that have the same pytree structure as `.params`.
674
+ **kwargs: Additional dataclass attributes that should be `.replace()`-ed.
675
+
676
+ Returns:
677
+ An updated instance of `self` with `step` incremented by one, `params`
678
+ and `opt_state` updated by applying `grads`, and additional attributes
679
+ replaced as specified by `kwargs`.
680
+ """
681
+ # clip gradients by global l2 norm
682
+ casted_max_grad_norm = to_dtype(self.max_grad_norm)
683
+ g_norm = linear_algebra.global_norm(grads)
684
+ g_norm = jnp.maximum(casted_max_grad_norm, g_norm)
685
+ grads = jax.tree_map(lambda t: (t / g_norm) * casted_max_grad_norm, grads)
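+ # The lines above implement global-norm clipping: gradients are rescaled by
+ # min(1, max_grad_norm / global_norm(grads)), so they are left unchanged when their global l2 norm is
+ # already below max_grad_norm and scaled down to exactly max_grad_norm otherwise.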
686
+
687
+ # perform update step in fp32 and subsequently downcast optimizer states if mixed precision training
688
+ # grads and opt_state in bf16 (need to upcast), params in fp32 (leave as is)
689
+ updates, new_opt_state = self.tx.update(to_fp32(grads), to_fp32(self.opt_state), self.params)
690
+
691
+ new_params = optax.apply_updates(self.params, updates)
692
+
693
+ return self.replace(
694
+ step=self.step + 1,
695
+ params=new_params,
696
+ opt_state=to_dtype(new_opt_state),
697
+ **kwargs,
698
+ )
699
+
700
+ @classmethod
701
+ def create(cls, *, apply_fn, params, tx, to_dtype: to_fp32, **kwargs):
702
+ """Creates a new instance with `step=0` and initialized `opt_state`."""
703
+ # downcast optimizer state to bf16 if mixed-precision training
704
+ opt_state = tx.init(to_dtype(params))
705
+ return cls(
706
+ step=0,
707
+ apply_fn=apply_fn,
708
+ params=params,
709
+ tx=tx,
710
+ opt_state=opt_state,
711
+ **kwargs,
712
+ )
713
+
714
+ def replicate(self):
715
+ return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
716
+
717
+ def unreplicate(self):
718
+ return jax_utils.unreplicate(self)
719
+
720
+ def save_state(self, output_dir, save_total_limit=None, checkpoint_prefix="checkpoint"):
721
+ step = int(jax.device_get(unreplicate(self.step)))
722
+ serialized_state = to_bytes(self.unreplicate())
723
+
724
+ output_file = Path(os.path.join(output_dir, f"{checkpoint_prefix}-{step}", "train_state.msgpack"))
725
+ output_file.parent.mkdir(exist_ok=True, parents=True)
726
+
727
+ with output_file.open("wb") as f:
728
+ f.write(serialized_state)
729
+
730
+ logger.info(f"Flax train state saved in {output_file}")
731
+ rotate_checkpoints(
732
+ save_total_limit=save_total_limit, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
733
+ )
734
+
735
+
736
+ def save_hf_weights(
737
+ student_state: TrainState,
738
+ student_model: FlaxWhisperForConditionalGeneration,
739
+ processor: WhisperProcessor,
740
+ output_dir: str,
741
+ cur_step: int,
742
+ total_train_steps: int,
743
+ use_scan: bool = True,
744
+ checkpoint_prefix: str = "checkpoint",
745
+ ) -> None:
746
+ # always disable scan in the params / model so that we can load from PyTorch directly - this is a no-op if we're not using scan for training
747
+ student_state_params = unreplicate(student_state.params)
748
+ student_state_params = student_model.convert_scan_to_unroll(student_state_params)
749
+ student_params = jax.device_get(student_state_params)
750
+ student_model.disable_scan()
751
+
752
+ if cur_step != total_train_steps:
753
+ output_dir = os.path.join(output_dir, f"{checkpoint_prefix}-{cur_step}")
754
+ os.makedirs(output_dir, exist_ok=True)
755
+
756
+ student_model.save_pretrained(output_dir, params=student_params)
757
+ processor.save_pretrained(output_dir)
758
+
759
+ # re-enable scan only if required for training
760
+ if use_scan:
761
+ student_model.enable_scan()
762
+
763
+
764
+ def write_train_metric(summary_writer, train_metrics, train_time, step, logging_steps):
765
+ summary_writer.scalar("train/time", train_time, step)
766
+
767
+ train_metrics = get_metrics(train_metrics)
768
+ for key, vals in train_metrics.items():
769
+ steps_arr = np.arange(0, step, logging_steps)[-len(vals) :]
770
+ tag = f"train/{key}"
771
+ for i, val in enumerate(vals):
772
+ summary_writer.scalar(tag, val, steps_arr[i])
773
+
774
+
775
+ def write_eval_metric(summary_writer, eval_metrics, step, prefix="eval"):
776
+ for metric_name, value in eval_metrics.items():
777
+ summary_writer.scalar(f"{prefix}/{metric_name}", value, step)
778
+
779
+
780
+ def write_wandb_metric(wandb_logger, metrics, train_time, step, epoch, prefix="train"):
781
+ log_metrics = {}
782
+ for k, v in metrics.items():
783
+ log_metrics[f"{prefix}/{k}"] = v
784
+ log_metrics[f"{prefix}/time"] = train_time
785
+ log_metrics[f"{prefix}/epoch"] = epoch
786
+ wandb_logger.log(log_metrics, step)
787
+
788
+
789
+ def write_wandb_pred(
790
+ wandb_logger, pred_str, label_str, norm_pred_str, norm_label_str, cur_step, prefix="eval", num_lines=200000
791
+ ):
792
+ # pretty name for current step: step 50000 -> step 50k
793
+ cur_step_pretty = f"{int(cur_step // 1000)}k" if cur_step > 1000 else cur_step
794
+ # convert str data to a wandb compatible format
795
+ str_data = [[label_str[i], pred_str[i], norm_label_str[i], norm_pred_str[i]] for i in range(len(pred_str))]
796
+ # log as a table with the appropriate headers
797
+ wandb_logger.log(
798
+ {
799
+ f"predictions/{prefix.replace('/', '-')}-step-{cur_step_pretty}": wandb_logger.Table(
800
+ columns=["Target", "Pred", "Norm Target", "Norm Pred"], data=str_data[:num_lines]
801
+ )
802
+ },
803
+ cur_step,
804
+ )
805
+ # log incorrect normalised predictions
806
+ str_data = np.asarray(str_data)
807
+ str_data_incorrect = str_data[str_data[:, -2] != str_data[:, -1]]
808
+ # log as a table with the appropriate headers
809
+ wandb_logger.log(
810
+ {
811
+ f"incorrect_predictions/{prefix.replace('/', '-')}-step-{cur_step_pretty}": wandb_logger.Table(
812
+ columns=["Target", "Pred", "Norm Target", "Norm Pred"], data=str_data_incorrect[:num_lines]
813
+ )
814
+ },
815
+ cur_step,
816
+ )
817
+
818
+
819
+ def create_learning_rate_fn(
820
+ num_train_steps: int, lr_scheduler_type: str, num_warmup_steps: int, learning_rate: float
821
+ ) -> Callable[[int], jnp.array]:
822
+ """Returns a linear warmup, linear_decay learning rate function."""
823
+ lr_scheduler_types = ("linear", "constant_with_warmup")
824
+
825
+ if lr_scheduler_type not in lr_scheduler_types:
826
+ raise ValueError(
827
+ f"lr_scheduler_type of type {lr_scheduler_type} not supported, choose from {lr_scheduler_types}."
828
+ )
829
+
830
+ warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
831
+ decay_fn = optax.linear_schedule(
832
+ init_value=learning_rate,
833
+ end_value=0 if lr_scheduler_type == "linear" else learning_rate,
834
+ transition_steps=num_train_steps - num_warmup_steps,
835
+ )
836
+ schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
837
+ return schedule_fn
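+ # Illustration with hypothetical values: create_learning_rate_fn(5000, "linear", 500, 1e-4) ramps the
+ # learning rate linearly from 0 to 1e-4 over the first 500 steps and then decays it linearly back to 0 by
+ # step 5000; with "constant_with_warmup" it instead stays at 1e-4 after warmup.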
838
+
839
+
840
+ def convert_dataset_str_to_list(
841
+ dataset_names,
842
+ dataset_config_names,
843
+ splits=None,
844
+ text_column_names=None,
845
+ dataset_samples=None,
846
+ default_split="train",
847
+ ):
848
+ if isinstance(dataset_names, str):
849
+ dataset_names = dataset_names.split("+")
850
+
851
+ # we assume that all the datasets we're using derive from the distil-whisper org on the Hub - prepend the org name if necessary
852
+ for i in range(len(dataset_names)):
853
+ ds_name = dataset_names[i]
854
+ dataset_names[i] = f"distil-whisper/{ds_name}" if "/" not in ds_name else ds_name
855
+
856
+ dataset_config_names = dataset_config_names.split("+")
857
+ splits = splits.split("+") if splits is not None else None
858
+ text_column_names = text_column_names.split("+") if text_column_names is not None else None
859
+ dataset_samples = dataset_samples.split("+") if dataset_samples is not None else None
860
+
861
+ # basic checks to ensure we've got the right number of datasets/configs/splits/columns/probs
862
+ if len(dataset_names) != len(dataset_config_names):
863
+ raise ValueError(
864
+ f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
865
+ f" {len(dataset_config_names)} configs."
866
+ )
867
+
868
+ if splits is not None and len(splits) != len(dataset_names):
869
+ raise ValueError(
870
+ f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
871
+ )
872
+
873
+ if text_column_names is not None and len(text_column_names) != len(dataset_names):
874
+ raise ValueError(
875
+ f"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and"
876
+ f" {len(text_column_names)} text column names."
877
+ )
878
+
879
+ if dataset_samples is not None:
880
+ if len(dataset_samples) != len(dataset_names):
881
+ raise ValueError(
882
+ f"Ensure one sample is passed for each dataset, got {len(dataset_names)} datasets and "
883
+ f"{len(dataset_samples)} samples."
884
+ )
885
+ dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
886
+ else:
887
+ dataset_samples = [None] * len(dataset_names)
888
+
889
+ text_column_names = (
890
+ text_column_names if text_column_names is not None else ["text" for _ in range(len(dataset_names))]
891
+ )
892
+ splits = splits if splits is not None else [default_split for _ in range(len(dataset_names))]
893
+
894
+ dataset_names_dict = []
895
+ for i, ds_name in enumerate(dataset_names):
896
+ dataset_names_dict.append(
897
+ {
898
+ "name": ds_name,
899
+ "config": dataset_config_names[i],
900
+ "split": splits[i],
901
+ "text_column_name": text_column_names[i],
902
+ "samples": dataset_samples[i],
903
+ }
904
+ )
905
+ return dataset_names_dict
906
+
907
+
908
+ def load_multiple_datasets(
909
+ dataset_names: Union[List, str],
910
+ dataset_config_names: Union[List, str],
911
+ splits: Optional[Union[List, str]] = None,
912
+ text_column_names: Optional[List] = None,
913
+ sampling_rate: Optional[int] = 16000,
914
+ stopping_strategy: Optional[str] = "first_exhausted",
915
+ dataset_samples: Optional[Union[List, np.array]] = None,
916
+ streaming: bool = True,
917
+ seed: int = None,
918
+ **kwargs,
919
+ ) -> IterableDataset:
920
+ dataset_names_dict = convert_dataset_str_to_list(
921
+ dataset_names, dataset_config_names, splits, text_column_names, dataset_samples
922
+ )
923
+
924
+ if dataset_samples is not None:
925
+ dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
926
+ probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
927
+ else:
928
+ probabilities = None
929
+
930
+ if len(dataset_names_dict) == 1:
931
+ dataset_dict = dataset_names_dict[0]
932
+ # we have a single dataset so just return it as is
933
+ return load_dataset(
934
+ dataset_dict["name"],
935
+ dataset_dict["config"],
936
+ split=dataset_dict["split"],
937
+ streaming=streaming,
938
+ **kwargs,
939
+ )
940
+
941
+ all_datasets = []
942
+ # iterate over the datasets we want to interleave
943
+ for dataset_dict in tqdm(dataset_names_dict, desc="Combining datasets..."):
944
+ dataset = load_dataset(
945
+ dataset_dict["name"],
946
+ dataset_dict["config"],
947
+ split=dataset_dict["split"],
948
+ streaming=streaming,
949
+ **kwargs,
950
+ )
951
+ # resample to specified sampling rate
952
+ dataset = dataset.cast_column("audio", datasets.features.Audio(sampling_rate))
953
+ dataset = dataset.remove_columns(
954
+ set(dataset.features.keys()) - {"audio", dataset_dict["text_column_name"], "whisper_transcript"}
955
+ )
956
+ all_datasets.append(dataset)
957
+
958
+ if streaming:
959
+ interleaved_dataset = interleave_datasets(
960
+ all_datasets,
961
+ stopping_strategy=stopping_strategy,
962
+ probabilities=probabilities,
963
+ seed=seed,
964
+ )
965
+ else:
966
+ interleaved_dataset = concatenate_datasets(all_datasets)
967
+
968
+ return interleaved_dataset
969
+
970
+
971
+ def get_layers_to_supervise(student_layers: int, teacher_layers: int) -> dict:
972
+ """Helper function to map the student layer i to the teacher layer j whose output we'd like them to emulate. Used
973
+ for MSE loss terms in distillation (hidden-states and activations). Student layers are paired with teacher layers
974
+ in equal increments, e.g. for a 12-layer model distilled to a 3-layer model, student layer 0 emulates teacher layer
975
+ 3 (such that it behaves like the first 4 teacher layers), student layer 1 emulates teacher layer 7, and student layer
976
+ 2 emulates teacher layer 11. This mapping is summarised by the dictionary: {0: 3, 1: 7, 2: 11}, which is precisely
977
+ the output of this function for the arguments (student_layers=3, teacher_layers=12)."""
978
+ layer_intervals = np.linspace(teacher_layers // student_layers - 1, teacher_layers - 1, student_layers, dtype=int)
979
+ layer_intervals[-1] = teacher_layers - 1
980
+ layer_map = {}
981
+
982
+ for student_layer, teacher_layer in enumerate(layer_intervals):
983
+ layer_map[student_layer] = teacher_layer
984
+
985
+ return layer_map
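+ # Illustration: get_layers_to_supervise(student_layers=2, teacher_layers=12) returns {0: 5, 1: 11},
+ # i.e. student layer 0 is trained to match teacher layer 5 and student layer 1 to match the final teacher layer.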
986
+
987
+
988
+ class FlaxWhisperFeatureExtractor(WhisperFeatureExtractor):
989
+ def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
990
+ """
991
+ Compute the log-mel spectrogram of the provided audio using torch filters. The torch implementation
992
+ computes the stft filter banks approx 5x faster than its numpy counterpart, which is the native implementation
993
+ in transformers, and matches it to within 1e-5 abs tolerance.
994
+ """
995
+ waveform = torch.from_numpy(waveform).type(torch.float32)
996
+
997
+ window = torch.hann_window(self.n_fft)
998
+ stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
999
+ magnitudes = stft[..., :-1].abs() ** 2
1000
+
1001
+ mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
1002
+ mel_spec = mel_filters.T @ magnitudes
1003
+
1004
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
1005
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
1006
+ log_spec = (log_spec + 4.0) / 4.0
1007
+ return log_spec.numpy()
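+ # Shape sketch, assuming the default Whisper feature extractor settings (16 kHz audio, n_fft=400,
+ # hop_length=160, 80 mel bins): a padded 30-second input of 480000 samples produces an (80, 3000) log-mel array,
+ # matching the numpy implementation it replaces to roughly 1e-5.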
1008
+
1009
+
1010
+ def main():
1011
+ # 1. Parse input arguments
1012
+ # See all possible arguments in src/transformers/training_args.py
1013
+ # or by passing the --help flag to this script.
1014
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
1015
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, FlaxSeq2SeqTrainingArguments))
1016
+
1017
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
1018
+ # If we pass only one argument to the script and it's the path to a json file,
1019
+ # let's parse it to get our arguments.
1020
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
1021
+ else:
1022
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
1023
+
1024
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
1025
+ # information sent is the one passed as arguments along with your JAX/Flax versions.
1026
+ send_example_telemetry("run_flax_speech_recognition_seq2seq", model_args, data_args, framework="flax")
1027
+
1028
+ # 2. Define remote logging - do this early so that we get the full traceback on our remote logs
1029
+ # Enable tensorboard only on the master node
1030
+ has_tensorboard = is_tensorboard_available()
1031
+ if has_tensorboard:
1032
+ if jax.process_index() == 0:
1033
+ try:
1034
+ from flax.metrics.tensorboard import SummaryWriter
1035
+
1036
+ summary_writer = SummaryWriter(log_dir=os.path.join(Path(training_args.output_dir), "runs"))
1037
+ except ImportError as ie:
1038
+ has_tensorboard = False
1039
+ logger.warning(
1040
+ "Unable to display metrics through TensorBoard because some package" f" are not installed: {ie}"
1041
+ )
1042
+ else:
1043
+ logger.warning(
1044
+ "Unable to display metrics through TensorBoard because the package is not"
1045
+ " installed: Please run `pip install tensorboard` to enable."
1046
+ )
1047
+
1048
+ # Enable wandb only on the master node
1049
+ has_wandb = is_wandb_available()
1050
+ if has_wandb:
1051
+ import wandb as wandb_logger
1052
+
1053
+ # Set up wandb run
1054
+ if jax.process_index() == 0:
1055
+ wandb_logger.init(
1056
+ project=data_args.wandb_project,
1057
+ name=data_args.wandb_name,
1058
+ job_type=data_args.wandb_job_type,
1059
+ dir=data_args.wandb_dir,
1060
+ save_code=data_args.save_code_to_wandb,
1061
+ )
1062
+ else:
1063
+ logger.warning("Wandb logging requires wandb to be installed. Run `pip install wandb` to enable.")
1064
+
1065
+ # 3. Setup local logging
1066
+ # Make one log on every process with the configuration for debugging.
1067
+ logging.basicConfig(
1068
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
1069
+ datefmt="%m/%d/%Y %H:%M:%S",
1070
+ handlers=[logging.StreamHandler(sys.stdout)],
1071
+ )
1072
+ # Set the verbosity to info of the Transformers logger.
1073
+ # We only want one process per machine to log things on the screen.
1074
+ logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
1075
+ if jax.process_index() == 0:
1076
+ datasets.utils.logging.set_verbosity_warning()
1077
+ transformers.utils.logging.set_verbosity_info()
1078
+ else:
1079
+ datasets.utils.logging.set_verbosity_error()
1080
+ transformers.utils.logging.set_verbosity_error()
1081
+
1082
+ logger.info("Training/evaluation parameters %s", training_args)
1083
+
1084
+ # Check the output dir is valid
1085
+ if (
1086
+ os.path.exists(training_args.output_dir)
1087
+ and os.listdir(training_args.output_dir)
1088
+ and training_args.do_train
1089
+ and not training_args.overwrite_output_dir
1090
+ ):
1091
+ raise ValueError(
1092
+ f"Output directory ({training_args.output_dir}) already exists and is not"
1093
+ " empty. Use `--overwrite_output_dir` to overcome."
1094
+ )
1095
+
1096
+ # 4. Handle the repository creation
1097
+ if training_args.push_to_hub:
1098
+ if training_args.hub_model_id is None:
1099
+ repo_name = get_full_repo_name(
1100
+ Path(training_args.output_dir).absolute().name,
1101
+ token=training_args.hub_token,
1102
+ )
1103
+ else:
1104
+ repo_name = training_args.hub_model_id
1105
+ create_repo(repo_name, exist_ok=True, token=training_args.hub_token)
1106
+ repo = Repository(
1107
+ training_args.output_dir,
1108
+ clone_from=repo_name,
1109
+ token=training_args.hub_token,
1110
+ )
1111
+
1112
+ if training_args.compilation_cache:
1113
+ cc.initialize_cache(os.path.join(model_args.cache_dir, "jax_cache"))
1114
+
1115
+ # 5. Load dataset
1116
+ raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1117
+
1118
+ # set seed for determinism
1119
+ set_seed(training_args.seed)
1120
+
1121
+ if training_args.do_train:
1122
+ print("loading raw")
1123
+ raw_datasets["train"] = load_multiple_datasets(
1124
+ data_args.train_dataset_name,
1125
+ data_args.train_dataset_config_name,
1126
+ splits=data_args.train_split_name,
1127
+ streaming=data_args.streaming,
1128
+ dataset_samples=data_args.train_dataset_samples,
1129
+ seed=training_args.seed,
1130
+ cache_dir=data_args.dataset_cache_dir,
1131
+ token=True if model_args.use_auth_token else None,
1132
+ )
1133
+
1134
+ if training_args.do_eval:
1135
+ dataset_names_dict = convert_dataset_str_to_list(
1136
+ data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
1137
+ (
1138
+ data_args.eval_dataset_config_name
1139
+ if data_args.eval_dataset_config_name
1140
+ else data_args.train_dataset_config_name
1141
+ ),
1142
+ splits=data_args.eval_split_name,
1143
+ text_column_names=data_args.eval_text_column_name,
1144
+ )
1145
+ all_eval_splits = []
1146
+ if len(dataset_names_dict) == 1:
1147
+ # load a single eval set
1148
+ dataset_dict = dataset_names_dict[0]
1149
+ all_eval_splits.append("eval")
1150
+ raw_datasets["eval"] = load_dataset(
1151
+ dataset_dict["name"],
1152
+ dataset_dict["config"],
1153
+ split=dataset_dict["split"],
1154
+ cache_dir=data_args.dataset_cache_dir,
1155
+ token=True if model_args.use_auth_token else None,
1156
+ streaming=data_args.streaming,
1157
+ )
1158
+ else:
1159
+ # load multiple eval sets
1160
+ for dataset_dict in dataset_names_dict:
1161
+ if dataset_dict["name"] == "esb/diagnostic-dataset":
1162
+ # for the ESB diagnostic dataset, the dataset name is effectively the config
1163
+ pretty_name = f"{dataset_dict['config']}-diagnostic/{dataset_dict['split']}"
1164
+ else:
1165
+ pretty_name = f"{dataset_dict['name'].split('/')[-1]}/{dataset_dict['split'].replace('.', '-')}"
1166
+ all_eval_splits.append(pretty_name)
1167
+ raw_datasets[pretty_name] = load_dataset(
1168
+ dataset_dict["name"],
1169
+ dataset_dict["config"],
1170
+ split=dataset_dict["split"],
1171
+ cache_dir=data_args.dataset_cache_dir,
1172
+ token=True if model_args.use_auth_token else None,
1173
+ streaming=data_args.streaming,
1174
+ )
1175
+ features = raw_datasets[pretty_name].features.keys()
1176
+ if "text" not in features:
1177
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].rename_column(
1178
+ dataset_dict["text_column_name"], "text"
1179
+ )
1180
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].remove_columns(
1181
+ set(raw_datasets[pretty_name].features.keys()) - {"audio", "text"}
1182
+ )
1183
+
1184
+ if not training_args.do_train and not training_args.do_eval:
1185
+ raise ValueError(
1186
+ "Cannot not train and not do evaluation. At least one of training or evaluation has to be performed."
1187
+ )
1188
+
1189
+ raw_datasets_train_features = list(raw_datasets["train"].features.keys())
1190
+ print("debug 1")
1191
+
1192
+ if data_args.audio_column_name not in raw_datasets_train_features:
1193
+ raise ValueError(
1194
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset"
1195
+ f" '{data_args.dataset_name}'. Make sure to set `--audio_column_name` to"
1196
+ " the correct audio column - one of"
1197
+ f" {', '.join(raw_datasets_train_features)}."
1198
+ )
1199
+
1200
+ if data_args.train_text_column_name not in raw_datasets_train_features:
1201
+ raise ValueError(
1202
+ f"--train_text_column_name {data_args.train_text_column_name} not found in dataset"
1203
+ f" '{data_args.dataset_name}'. Make sure to set `--train_text_column_name` to the"
1204
+ " correct text column - one of"
1205
+ f" {', '.join(raw_datasets_train_features)}."
1206
+ )
1207
+
1208
+ # 6. Load pretrained model, tokenizer, and feature extractor
1209
+ config = WhisperConfig.from_pretrained(
1210
+ (model_args.config_name if model_args.config_name else model_args.model_name_or_path),
1211
+ cache_dir=model_args.cache_dir,
1212
+ revision=model_args.model_revision,
1213
+ token=True if model_args.use_auth_token else None,
1214
+ )
1215
+ feature_extractor = FlaxWhisperFeatureExtractor.from_pretrained(
1216
+ (model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path),
1217
+ cache_dir=model_args.cache_dir,
1218
+ revision=model_args.model_revision,
1219
+ token=True if model_args.use_auth_token else None,
1220
+ )
1221
+ tokenizer = WhisperTokenizerFast.from_pretrained(
1222
+ (model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path),
1223
+ cache_dir=model_args.cache_dir,
1224
+ use_fast=model_args.use_fast_tokenizer,
1225
+ revision=model_args.model_revision,
1226
+ token=True if model_args.use_auth_token else None,
1227
+ )
1228
+ print("debug2")
1229
+ # override timestamp tokens until tokenizer issues are fixed in transformers
1230
+ timestamps = [AddedToken("<|%.2f|>" % (i * 0.02), lstrip=False, rstrip=False) for i in range(1500 + 1)]
1231
+ tokenizer.add_tokens(timestamps)
1232
+
1233
+ config.update(
1234
+ {
1235
+ "activation_dropout": model_args.activation_dropout,
1236
+ "attention_dropout": model_args.attention_dropout,
1237
+ "dropout": model_args.dropout,
1238
+ }
1239
+ )
1240
+
1241
+ if training_args.precision == "full_mixed":
1242
+ # forward pass, backward pass and optimiser states in bf16
1243
+ dtype = jnp.bfloat16
1244
+ to_dtype = to_bf16
1245
+ elif training_args.precision == "half_mixed" or model_args.dtype == "bfloat16":
1246
+ # forward pass in bf16, backward pass and optimiser states in fp32
1247
+ dtype = jnp.bfloat16
1248
+ to_dtype = to_fp32
1249
+ else:
1250
+ if training_args.precision != "full":
1251
+ raise ValueError(
1252
+ f"`precision` should be one of: `full`, `half_mixed` or `full_mixed`, got {training_args.precision}"
1253
+ )
1254
+ # forward pass, backward pass and optimiser states in fp32
1255
+ dtype = jnp.float32
1256
+ to_dtype = to_fp32
1257
+
1258
+ student_model, student_params = FlaxWhisperForConditionalGeneration.from_pretrained(
1259
+ model_args.model_name_or_path,
1260
+ config=config,
1261
+ dtype=dtype,
1262
+ cache_dir=model_args.cache_dir,
1263
+ revision=model_args.model_revision,
1264
+ subfolder=model_args.subfolder,
1265
+ token=True if model_args.use_auth_token else None,
1266
+ _do_init=False,
1267
+ use_scan=model_args.load_with_scan_weights,
1268
+ )
1269
+
1270
+ teacher_model, teacher_params = FlaxWhisperForConditionalGeneration.from_pretrained(
1271
+ model_args.teacher_model_name_or_path,
1272
+ # config=config,
1273
+ dtype=dtype,
1274
+ cache_dir=model_args.cache_dir,
1275
+ # revision=model_args.model_revision,
1276
+ token=True if model_args.use_auth_token else None,
1277
+ _do_init=False,
1278
+ )
1279
+ print("debug 3")
1280
+ if student_model.config.decoder_start_token_id is None or teacher_model.config.decoder_start_token_id is None:
1281
+ raise ValueError(
1282
+ f"Make sure that `config.decoder_start_token_id` is correctly defined for both the "
1283
+ f"student and teacher model. Got {student_model.config.decoder_start_token_id} for the "
1284
+ f"student and {teacher_model.config.decoder_start_token_id} for the teacher."
1285
+ )
1286
+
1287
+ # enable scan / gradient checkpointing if necessary
1288
+ if training_args.use_scan:
1289
+ student_model.enable_scan() # to enable scan in the nn.Module
1290
+ student_params = student_model.convert_unroll_to_scan(student_params) # to convert the unrolled params to scan
1291
+
1292
+ teacher_model.enable_scan() # faster compile time (even though we don't train the teacher)
1293
+ teacher_params = teacher_model.convert_unroll_to_scan(teacher_params)
1294
+
1295
+ if training_args.gradient_checkpointing:
1296
+ student_model.enable_gradient_checkpointing() # to enable checkpointing in the nn.Module, there is no change to the params structure
1297
+ teacher_model.enable_gradient_checkpointing()
1298
+ print("debug 4")
1299
+ if hasattr(teacher_model.generation_config, "is_multilingual") and teacher_model.generation_config.is_multilingual:
1300
+ # We need to set the language and task ids for previously multilingual checkpoints - for now we hardcode this to Norwegian
1301
+ tokenizer.set_prefix_tokens(language="Norwegian", task="transcribe", predict_timestamps=False)
1302
+ student_model.generation_config.update(
1303
+ **{
1304
+ "language": "<|no|>",
1305
+ "task": "transcribe",
1306
+ }
1307
+ )
1308
+ print("debug 5")
1309
+ # 7. Resample speech dataset: `datasets` takes care of automatically loading and resampling the audio,
1310
+ # so we just need to set the correct target sampling rate.
1311
+ raw_datasets = raw_datasets.cast_column(
1312
+ data_args.audio_column_name,
1313
+ datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
1314
+ )
1315
+
1316
+ # 8. Preprocessing the datasets.
1317
+ # We need to read the audio files as arrays and tokenize the targets.
1318
+ max_input_length = int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate)
1319
+ min_input_length = int(data_args.min_duration_in_seconds * feature_extractor.sampling_rate)
1320
+ max_label_length = (
1321
+ data_args.max_label_length if data_args.max_label_length is not None else student_model.config.max_length
1322
+ )
1323
+ audio_column_name = data_args.audio_column_name
1324
+ num_workers = data_args.preprocessing_num_workers
1325
+ dataloader_num_workers = training_args.dataloader_num_workers
1326
+ dataloader_prefetch_size = data_args.prefetch_size
1327
+ train_text_column_name = data_args.train_text_column_name
1328
+ eval_text_column_name = "text"
1329
+ model_input_name = feature_extractor.model_input_names[0]
1330
+ normalizer = BasicTextNormalizer(tokenizer.english_spelling_normalizer)
1331
+ wer_threshold = data_args.wer_threshold
1332
+ round_timestamps = data_args.round_timestamps
1333
+ print("debug 6")
1334
+ if training_args.do_train and data_args.max_train_samples is not None:
1335
+ raw_datasets["train"] = (
1336
+ raw_datasets["train"].take(data_args.max_train_samples)
1337
+ if data_args.streaming
1338
+ else raw_datasets["train"].select(range(data_args.max_train_samples))
1339
+ )
1340
+
1341
+ if training_args.do_eval and data_args.max_eval_samples is not None:
1342
+ for eval_split in all_eval_splits:
1343
+ raw_datasets[eval_split] = (
1344
+ raw_datasets[eval_split].take(data_args.max_eval_samples)
1345
+ if data_args.streaming
1346
+ else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
1347
+ )
1348
+ print("debug 7")
1349
+ # 10.3: filter training data based on WER threshold -> this is KEY to good distillation performance
1350
+ def is_wer_in_range(ground_truth, whisper_transcript):
1351
+ norm_ground_truth = normalizer(ground_truth)
1352
+ if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
1353
+ # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
1354
+ return False
1355
+ elif whisper_transcript is not None and len(norm_ground_truth) == 0 and len(normalizer(whisper_transcript)) == 0:
1356
+ return True
1357
+ elif len(norm_ground_truth.strip()) > 0 and whisper_transcript is not None and len(normalizer(whisper_transcript).strip()) > 0:
1358
+ norm_whisper_transcript = normalizer(whisper_transcript)
1359
+ wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
1360
+ return wer < wer_threshold
1361
+ else:
1362
+ # filter automatically since we can't know the WER
1363
+ return False
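+ # Example with hypothetical strings and threshold: ground truth "hello world" vs. pseudo-label "hello word"
+ # gives a WER of 50%, so with wer_threshold=10 the sample would be filtered out of the distillation set.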
1364
+
1365
+
1366
+ filter_by_wer_threshold = partial(
1367
+ raw_datasets["train"].filter,
1368
+ function=is_wer_in_range,
1369
+ input_columns=[eval_text_column_name, train_text_column_name],
1370
+ )
1371
+
1372
+ if wer_threshold is not None:
1373
+ raw_datasets["train"] = (
1374
+ filter_by_wer_threshold(num_proc=num_workers, desc="filtering train dataset by wer")
1375
+ if not data_args.streaming
1376
+ else filter_by_wer_threshold()
1377
+ )
1378
+
1379
+ def has_timestamp_tokens(input_str):
1380
+ """
1381
+ Identify whether the input string contains timestamp tokens, of the form <|0.00|>, by searching for
1382
+ pairs of left and right-angle brackets.
1383
+ """
1384
+ return bool(re.search(r"\<[^\>]*\>", input_str))
1385
+
1386
+ def round_timestamp_tokens(input_str: str, ndigits: int = 1):
1387
+ timestamps = re.findall(r"\<[^\>]*\>", input_str, re.DOTALL)
1388
+ for token in timestamps:
1389
+ # extract time digits from timestamp token, e.g. <|6.24|> to 6.24
1390
+ time_digit = token[2:-2]
1391
+ # round to specified number of digits, e.g. 6.24 to 6.2
1392
+ time_digit = round(float(time_digit), ndigits=ndigits)
1393
+ # replace in original string with the same precision, e.g. <|6.24|> to <|6.20|>
1394
+ input_str = input_str.replace(token, "<|{:.2f}|>".format(time_digit))
1395
+ return input_str
1396
+
1397
+ def prepare_train_dataset(batch):
1398
+ # process audio input
1399
+ sample = batch[audio_column_name]
1400
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1401
+ batch[model_input_name] = inputs.get(model_input_name)[0]
1402
+ batch["input_length"] = len(sample["array"])
1403
+
1404
+ # process text targets
1405
+ input_str = batch[train_text_column_name]
1406
+
1407
+ # prompt & timestamp processing: for now, we only do one or the other
1408
+ if input_str.startswith("<|startoftranscript|>") or input_str.startswith("<|startofprev|>"):
1409
+ # prompted target text already has special ids added, so don't add them here
1410
+ batch["labels"] = tokenizer(input_str, add_special_tokens=False).input_ids
1411
+ return batch
1412
+
1413
+ has_timestamps = has_timestamp_tokens(input_str)
1414
+
1415
+ if has_timestamps:
1416
+ predict_timestamps = bool(np.random.binomial(1, data_args.timestamp_probability))
1417
+ if not predict_timestamps:
1418
+ # filter timestamp token ids if not part of the prediction task
1419
+ input_str = tokenizer._filter_timestamp_ids(input_str)
1420
+ elif round_timestamps:
1421
+ input_str = round_timestamp_tokens(input_str)
1422
+ else:
1423
+ predict_timestamps = False
1424
+
1425
+ tokenizer.set_prefix_tokens(language="Norwegian", task="transcribe", predict_timestamps=predict_timestamps)
1426
+ input_ids = tokenizer(input_str).input_ids
1427
+ batch["labels"] = input_ids
1428
+ return batch
1429
+
1430
+ def prepare_eval_dataset(batch):
1431
+ # process audio
1432
+ sample = batch[audio_column_name]
1433
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1434
+ # process audio length
1435
+ batch[model_input_name] = inputs.get(model_input_name)[0]
1436
+ batch["input_length"] = len(sample["array"])
1437
+
1438
+ # process targets
1439
+ input_str = batch[eval_text_column_name]
1440
+ batch["labels"] = tokenizer(input_str).input_ids
1441
+ return batch
1442
+
1443
+ vectorized_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1444
+ if training_args.do_train:
1445
+ map_fn_train = partial(
1446
+ raw_datasets["train"].map, function=prepare_train_dataset, remove_columns=raw_datasets_train_features
1447
+ )
1448
+ vectorized_datasets["train"] = (
1449
+ map_fn_train(num_proc=num_workers, desc="preprocess train dataset")
1450
+ if not data_args.streaming
1451
+ else map_fn_train()
1452
+ )
1453
+ if training_args.do_eval:
1454
+ for eval_split in all_eval_splits:
1455
+ raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
1456
+ map_fn_eval = partial(
1457
+ raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
1458
+ )
1459
+ vectorized_datasets[eval_split] = (
1460
+ map_fn_eval(num_proc=num_workers, desc="preprocess eval dataset")
1461
+ if not data_args.streaming
1462
+ else map_fn_eval()
1463
+ )
1464
+
1465
+ # filter training data with inputs longer than max_input_length
1466
+ def is_audio_in_length_range(length):
1467
+ return min_input_length < length < max_input_length
1468
+
1469
+ filter_by_audio_fn = partial(
1470
+ vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1471
+ )
1472
+ vectorized_datasets = (
1473
+ filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1474
+ if not data_args.streaming
1475
+ else filter_by_audio_fn()
1476
+ )
1477
+
1478
+ # filter training data with labels longer than max_label_length
1479
+ def is_labels_in_length_range(labels):
1480
+ return 0 < len(labels) < max_label_length
1481
+
1482
+ filter_by_labels_fn = partial(
1483
+ vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1484
+ )
1485
+ vectorized_datasets = (
1486
+ filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1487
+ if not data_args.streaming
1488
+ else filter_by_labels_fn()
1489
+ )
1490
+
1491
+ # for large datasets it is advised to run the preprocessing on a
1492
+ # single machine first with `args.preprocessing_only` since there will most likely
1493
+ # be a timeout when running the script in distributed mode.
1494
+ # In a second step `args.preprocessing_only` can then be set to `False` to load the
1495
+ # cached dataset
1496
+ if data_args.preprocessing_only:
1497
+ cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1498
+ logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1499
+ return
1500
+
1501
+ # 8. Load Metric
1502
+ metric = evaluate.load("wer")
1503
+ # convention is that we space all punctuation *except* apostrophes
1504
+ all_punctuation = list(string.punctuation.replace("'", ""))
1505
+ return_timestamps = data_args.return_timestamps if data_args.timestamp_probability > 0 else False
1506
+
1507
+ def compute_metrics(preds, labels):
1508
+ # replace padded labels by the padding token
1509
+ for idx in range(len(labels)):
1510
+ labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
1511
+
1512
+ pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
1513
+ # we do not want to group tokens when computing the metrics
1514
+ label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
1515
+
1516
+ # space punctuation for orthographic WER (c.f. ESB paper https://arxiv.org/abs/2210.13352)
1517
+ spaced_pred_str = [
1518
+ pred_str[i].replace(punctuation, f" {punctuation} ")
1519
+ for punctuation in all_punctuation
1520
+ for i in range(len(pred_str))
1521
+ ]
1522
+ spaced_label_str = [
1523
+ label_str[i].replace(punctuation, f" {punctuation} ")
1524
+ for punctuation in all_punctuation
1525
+ for i in range(len(label_str))
1526
+ ]
1527
+ wer_ortho = 100 * metric.compute(predictions=spaced_pred_str, references=spaced_label_str)
1528
+
1529
+ # Initialise the containers for the normalised strings, then iterate through all predictions and labels
+ norm_pred_str, norm_label_str = [], []
1530
+ for pred, label in zip(pred_str, label_str):
1531
+ # Normalize the prediction and label
1532
+ normalized_pred = normalizer(pred)
1533
+ normalized_label = normalizer(label)
1534
+
1535
+ # If either normalized string is empty after normalization, replace with "<|nospeech|>"
1536
+ if not normalized_pred.strip():
1537
+ normalized_pred = "<|nospeech|>"
1538
+ if not normalized_label.strip():
1539
+ normalized_label = "<|nospeech|>"
1540
+
1541
+ norm_pred_str.append(normalized_pred)
1542
+ norm_label_str.append(normalized_label)
1543
+
1544
+ # Replace original strings with "<|nospeech|>" where necessary for consistency
1545
+ pred_str = [pred if len(pred.strip()) > 0 else "<|nospeech|>" for pred in pred_str]
1546
+ label_str = [label if len(label.strip()) > 0 else "<|nospeech|>" for label in label_str]
1547
+
1548
+ # Compute WER using all entries, including those replaced with "<|nospeech|>"
1549
+ wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
1550
+ return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1551
+
1552
+
1553
+ # 9. Save feature extractor, tokenizer, config and generation config
1554
+ feature_extractor.save_pretrained(training_args.output_dir)
1555
+ tokenizer.save_pretrained(training_args.output_dir)
1556
+ config.save_pretrained(training_args.output_dir)
1557
+ student_model.generation_config.save_pretrained(
1558
+ training_args.output_dir
1559
+ ) # generation config stays bound to model to make it easy to jit
1560
+
1561
+ processor = WhisperProcessor.from_pretrained(training_args.output_dir)
1562
+
1563
+ data_collator = FlaxDataCollatorSpeechSeq2SeqWithPadding(
1564
+ processor=processor,
1565
+ decoder_start_token_id=student_model.config.decoder_start_token_id, # <|startoftranscript|>
1566
+ decoder_prev_token_id=tokenizer.all_special_ids[-3], # <|startofprev|>
1567
+ input_padding="longest",
1568
+ target_padding="max_length",
1569
+ max_target_length=max_label_length,
1570
+ )
1571
+
1572
+ # Initialize our training
1573
+ rng = jax.random.PRNGKey(training_args.seed)
1574
+ rng, dropout_rng = jax.random.split(rng)
1575
+
1576
+ # Store some constants
1577
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
1578
+ gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1579
+ per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1580
+ eval_batch_size = per_device_eval_batch_size * jax.device_count()
1581
+
1582
+ if not data_args.streaming and training_args.max_steps < 0:
1583
+ num_epochs = int(training_args.num_train_epochs)
1584
+ steps_per_epoch = len(vectorized_datasets["train"]) // train_batch_size
1585
+ total_train_steps = steps_per_epoch * num_epochs
1586
+ elif training_args.max_steps > 0:
1587
+ logger.info("max_steps is given, it will override any value given in num_train_epochs")
1588
+ total_train_steps = int(training_args.max_steps)
1589
+ # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1590
+ num_epochs = sys.maxsize
1591
+ steps_per_epoch = total_train_steps
1592
+ else:
1593
+ raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1594
+
1595
+ if training_args.eval_steps is None:
1596
+ logger.info(
1597
+ f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1598
+ )
1599
+ eval_steps = steps_per_epoch
1600
+ else:
1601
+ eval_steps = training_args.eval_steps
1602
+
1603
+ # Create learning rate schedule
1604
+ linear_decay_lr_schedule_fn = create_learning_rate_fn(
1605
+ total_train_steps * gradient_accumulation_steps,
1606
+ training_args.lr_scheduler_type,
1607
+ training_args.warmup_steps * gradient_accumulation_steps,
1608
+ training_args.learning_rate,
1609
+ )
1610
+
1611
+ # We use Optax's "masking" functionality to not apply weight decay
1612
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
1613
+ # mask boolean with the same structure as the parameters.
1614
+ # The mask is True for parameters that should be decayed.
1615
+ def decay_mask_fn(params):
1616
+ flat_params = traverse_util.flatten_dict(params)
1617
+ # find out all LayerNorm parameters
1618
+ layer_norm_candidates = [
1619
+ "layer_norm",
1620
+ "self_attn_layer_norm",
1621
+ "final_layer_norm",
1622
+ "encoder_attn_layer_norm",
1623
+ ]
1624
+ layer_norm_named_params = {
1625
+ layer[-2:]
1626
+ for layer_norm_name in layer_norm_candidates
1627
+ for layer in flat_params.keys()
1628
+ if layer_norm_name in "".join(layer).lower()
1629
+ }
1630
+ flat_mask = {path: path[-1] != "bias" and path[-2:] not in layer_norm_named_params for path in flat_params}
1631
+ return traverse_util.unflatten_dict(flat_mask)
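+ # Illustration with hypothetical parameter paths: a weight such as ("encoder", "layers", "0", "fc1", "kernel")
+ # is decayed (mask True), while ("encoder", "layers", "0", "self_attn_layer_norm", "scale") and any path ending
+ # in "bias" are excluded from weight decay (mask False).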
1632
+
1633
+ # create adam optimizer
1634
+ adamw = optax.adamw(
1635
+ learning_rate=linear_decay_lr_schedule_fn,
1636
+ b1=training_args.adam_beta1,
1637
+ b2=training_args.adam_beta2,
1638
+ eps=training_args.adam_epsilon,
1639
+ weight_decay=training_args.weight_decay,
1640
+ mask=decay_mask_fn,
1641
+ )
1642
+
1643
+ if gradient_accumulation_steps > 1:
1644
+ # accumulate gradients and apply once every k steps
1645
+ adamw = optax.MultiSteps(adamw, every_k_schedule=gradient_accumulation_steps)
1646
+
1647
+ share_hidden_states = training_args.freeze_encoder and student_model.config.d_model == teacher_model.config.d_model
1648
+ encoder_layer_mapping = get_layers_to_supervise(
1649
+ student_model.config.encoder_layers, teacher_model.config.encoder_layers
1650
+ )
1651
+ decoder_layer_mapping = get_layers_to_supervise(
1652
+ student_model.config.decoder_layers, teacher_model.config.decoder_layers
1653
+ )
1654
+
1655
+ # Setup train state
1656
+ student_state = TrainState.create(
1657
+ apply_fn=student_model.decode if share_hidden_states else student_model.__call__,
1658
+ params=student_params,
1659
+ tx=adamw,
1660
+ to_dtype=to_dtype,
1661
+ dropout_rng=dropout_rng,
1662
+ max_grad_norm=training_args.max_grad_norm,
1663
+ )
1664
+
1665
+ if training_args.resume_from_checkpoint is not None:
1666
+ if os.path.isfile(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")):
1667
+ logger.info(
1668
+ f"Checkpoint detected, resuming training at {training_args.resume_from_checkpoint}. To avoid "
1669
+ "this behavior, omit the resume_from_checkpoint argument."
1670
+ )
1671
+ with Path(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")).open("rb") as f:
1672
+ student_state = from_bytes(student_state, f.read())
1673
+ else:
1674
+ logger.warning(
1675
+ f"Checkpoint {training_args.resume_from_checkpoint} not detected, training from scratch. Ensure "
1676
+ f"you pass the path to a folder with a valid checkpoint for your model."
1677
+ )
1678
+
1679
+ def cross_entropy_loss(logits, labels):
1680
+ vocab_size = logits.shape[-1]
1681
+ # optax onehot always returns a float32 device array, need to downcast if performing mixed precision training
1682
+ onehot_targets = to_dtype(onehot(labels, vocab_size))
1683
+ loss = optax.softmax_cross_entropy(logits, onehot_targets)
1684
+ # ignore padded tokens from loss, i.e. where labels are not set to -100
1685
+ padding = labels >= 0
1686
+ loss = loss * padding
1687
+ loss = loss.sum()
1688
+ num_labels = padding.sum()
1689
+ return loss, num_labels
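+ # Note: padded positions carry the label -100 from the collator, so `padding = labels >= 0` zeroes them out;
+ # the summed loss and the token count are returned separately so the caller can normalise by the total number
+ # of real label tokens after the cross-device psum.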
1690
+
1691
+ # temperature smoothed kl-divergence
1692
+ def kl_divergence(target_distribution, log_predicted_distribution, labels, eps=1e-20):
1693
+ divergence = -target_distribution * (log_predicted_distribution - jnp.log(target_distribution + eps))
1694
+ # ignore padded tokens from divergence, i.e. where labels are not set to -100
1695
+ padding_mask = labels >= 0
1696
+ padding_mask = jnp.expand_dims(padding_mask, axis=-1)
1697
+ divergence = (divergence * padding_mask).sum()
1698
+ return to_dtype(divergence) # respect the dtype of the backprop
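+ # This is the standard distillation KL term: a sum over the vocabulary of p_teacher * (log p_teacher - log p_student),
+ # masked wherever labels == -100; the caller multiplies it by temperature**2 so gradient magnitudes stay
+ # comparable across temperatures (as in Hinton et al. knowledge distillation).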
1699
+
1700
+ def mean_square_error_loss(student_outputs, teacher_outputs):
1701
+ mse = dtype(0.0)
1702
+
1703
+ # tie encoder embeddings
1704
+ mse += jnp.mean(
1705
+ jnp.square(teacher_outputs.encoder_hidden_states[0] - student_outputs.encoder_hidden_states[0])
1706
+ )
1707
+
1708
+ for student_layer_id, teacher_layer_id in encoder_layer_mapping.items():
1709
+ # offset the hidden-state layer ids by 1 to account for the extra embedding hidden-state
1710
+ student_hidden_state = student_outputs.encoder_hidden_states[student_layer_id + 1]
1711
+ teacher_hidden_state = teacher_outputs.encoder_hidden_states[teacher_layer_id + 1]
1712
+ mse += jnp.mean(jnp.square(teacher_hidden_state - student_hidden_state))
1713
+
1714
+ # student_attention = student_outputs.encoder_attentions[student_layer_id]
1715
+ # teacher_attention = teacher_outputs.encoder_attentions[teacher_layer_id]
1716
+ # mse += jnp.mean(jnp.square(student_attention - teacher_attention))
1717
+
1718
+ # tie decoder embeddings
1719
+ mse += jnp.mean(
1720
+ jnp.square(teacher_outputs.decoder_hidden_states[0] - student_outputs.decoder_hidden_states[0])
1721
+ )
1722
+
1723
+ for student_layer_id, teacher_layer_id in decoder_layer_mapping.items():
1724
+ # offset the hidden-state layer ids by 1 to account for the extra embedding hidden-state
1725
+ student_hidden_state = student_outputs.decoder_hidden_states[student_layer_id + 1]
1726
+ teacher_hidden_state = teacher_outputs.decoder_hidden_states[teacher_layer_id + 1]
1727
+ mse += jnp.mean(jnp.square(teacher_hidden_state - student_hidden_state))
1728
+
1729
+ # student_attention = student_outputs.decoder_attentions[student_layer_id]
1730
+ # teacher_attention = teacher_outputs.decoder_attentions[teacher_layer_id]
1731
+ # mse += jnp.mean(jnp.square(student_attention - teacher_attention))
1732
+
1733
+ # student_cross_attention = student_outputs.cross_attentions[student_layer_id]
1734
+ # teacher_cross_attention = teacher_outputs.cross_attentions[teacher_layer_id]
1735
+ # mse += jnp.mean(jnp.square(student_cross_attention - teacher_cross_attention))
1736
+
1737
+ return to_dtype(mse) # respect the dtype of the backprop
1738
+
1739
+ # Define gradient update step fn
1740
+ def train_step(
1741
+ student_state,
1742
+ teacher_params,
1743
+ batch,
1744
+ freeze_encoder,
1745
+ share_hidden_states,
1746
+ temperature=2.0,
1747
+ ):
1748
+ dropout_rng, new_dropout_rng = jax.random.split(student_state.dropout_rng)
1749
+
1750
+ def compute_loss(student_params):
1751
+ labels = batch.pop("labels")
1752
+ output_hidden_states = not share_hidden_states and training_args.mse_weight > 0.0
1753
+
1754
+ teacher_outputs = teacher_model(
1755
+ **batch,
1756
+ params=teacher_params,
1757
+ freeze_encoder=True,
1758
+ output_hidden_states=output_hidden_states,
1759
+ train=False,
1760
+ )
1761
+
1762
+ if share_hidden_states:
1763
+ # if the student and teacher share the same frozen encoder then we don't have to recompute the
1764
+ # encoder hidden-states for the student model, we can just re-use from the teacher
1765
+ encoder_hidden_states = jax.lax.stop_gradient(teacher_outputs.encoder_last_hidden_state)
1766
+ encoder_outputs = FlaxBaseModelOutput(last_hidden_state=encoder_hidden_states)
1767
+
1768
+ student_outputs = student_state.apply_fn(
1769
+ decoder_input_ids=batch["decoder_input_ids"],
1770
+ encoder_outputs=encoder_outputs,
1771
+ params=student_params,
1772
+ dropout_rng=dropout_rng,
1773
+ train=True,
1774
+ )
1775
+ else:
1776
+ # do the full forward pass for the student model (encoder + decoder)
1777
+ student_outputs = student_state.apply_fn(
1778
+ **batch,
1779
+ params=student_params,
1780
+ dropout_rng=dropout_rng,
1781
+ freeze_encoder=freeze_encoder,
1782
+ output_hidden_states=output_hidden_states,
1783
+ train=True,
1784
+ )
1785
+
1786
+ # CE (data) loss
1787
+ ce_loss, num_labels = cross_entropy_loss(student_outputs.logits, labels)
1788
+
1789
+ # rescale by temperature to ensure gradients scale correctly
1790
+ teacher_distribution = jax.nn.softmax(teacher_outputs.logits / temperature, axis=-1)
1791
+ # ensure no information flows backwards through the teacher
1792
+ teacher_distribution = jax.lax.stop_gradient(teacher_distribution)
1793
+ # log softmax of student predictions for numerical stability
1794
+ student_distribution = jax.nn.log_softmax(student_outputs.logits / temperature, axis=-1)
1795
+ # KL-divergence loss (scaled by temperature)
1796
+ kl_loss = kl_divergence(teacher_distribution, student_distribution, labels) * temperature**2
1797
+
1798
+ # MSE loss between enc-dec hidden-states and attentions
1799
+ mse_loss = (
1800
+ mean_square_error_loss(student_outputs, teacher_outputs)
1801
+ if output_hidden_states
1802
+ else jnp.zeros_like(kl_loss)
1803
+ )
1804
+
1805
+ # use DistilBart formulation - only tune the MSE weight and take remaining HPs from DistilBERT
1806
+ ce_weight = 0.8 if training_args.kl_weight > 0 else 1.0
1807
+ loss = ce_weight * ce_loss + training_args.kl_weight * kl_loss + training_args.mse_weight * mse_loss
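+ # e.g. with hypothetical settings kl_weight=1.0 and mse_weight=0.0 this reduces to
+ # loss = 0.8 * ce_loss + kl_loss, i.e. the DistilBERT-style CE/KL weighting referenced above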
1808
+
1809
+ return loss, (
1810
+ ce_loss,
1811
+ kl_loss,
1812
+ mse_loss,
1813
+ num_labels,
1814
+ )
1815
+
1816
+ grad_fn = jax.value_and_grad(compute_loss, has_aux=True)
1817
+ (loss, (ce_loss, kl_loss, mse_loss, num_labels)), grad = grad_fn(to_dtype(student_state.params))
1818
+
1819
+ # true loss = total loss / total samples
1820
+ loss = jax.lax.psum(loss, "batch")
1821
+ num_labels = jax.lax.psum(num_labels, "batch")
1822
+ loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
1823
+
1824
+ # true grad = total grad / total samples
1825
+ grad = jax.lax.psum(grad, "batch")
1826
+ grad = jax.tree_util.tree_map(lambda x: x / num_labels, grad)
1827
+ new_state = student_state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng, to_dtype=to_dtype)
1828
+
1829
+ # CE/KL/MSE losses for logging
1830
+ ce_loss = jax.lax.psum(ce_loss, "batch")
1831
+ ce_loss = jax.tree_util.tree_map(lambda x: x / num_labels, ce_loss)
1832
+
1833
+ kl_loss = jax.lax.psum(kl_loss, "batch")
1834
+ kl_loss = jax.tree_util.tree_map(lambda x: x / num_labels, kl_loss)
1835
+
1836
+ mse_loss = jax.lax.psum(mse_loss, "batch")
1837
+ mse_loss = jax.tree_util.tree_map(lambda x: x / num_labels, mse_loss)
1838
+
1839
+ metrics = {
1840
+ "loss": loss,
1841
+ "learning_rate": linear_decay_lr_schedule_fn(student_state.step),
1842
+ "ce_loss": ce_loss,
1843
+ "kl_loss": kl_loss,
1844
+ "mse_loss": mse_loss,
1845
+ }
1846
+ return new_state, metrics
1847
+
1848
+ # Define eval fn
1849
+ def eval_step(student_params, teacher_params, batch):
1850
+ labels = batch.pop("labels")
1851
+ output_hidden_states = not share_hidden_states and training_args.mse_weight > 0
1852
+
1853
+ student_outputs = student_model(
1854
+ **batch,
1855
+ params=student_params,
1856
+ output_hidden_states=output_hidden_states,
1857
+ train=False,
1858
+ )
1859
+ student_distribution = jax.nn.log_softmax(student_outputs.logits, axis=-1)
1860
+ ce_loss, num_labels = cross_entropy_loss(student_outputs.logits, labels)
1861
+
1862
+ teacher_outputs = teacher_model(
1863
+ **batch,
1864
+ params=teacher_params,
1865
+ output_hidden_states=output_hidden_states,
1866
+ train=False,
1867
+ )
1868
+ teacher_distribution = jax.nn.softmax(teacher_outputs.logits, axis=-1)
1869
+ # temperature is always 1 for eval
1870
+ kl_loss = kl_divergence(teacher_distribution, student_distribution, labels)
1871
+
1872
+ mse_loss = (
1873
+ mean_square_error_loss(student_outputs, teacher_outputs)
1874
+ if output_hidden_states
1875
+ else jnp.zeros_like(kl_loss)
1876
+ )
1877
+
1878
+ ce_weight = 0.8 if training_args.kl_weight > 0 else 1.0
1879
+ loss = ce_weight * ce_loss + training_args.kl_weight * kl_loss + training_args.mse_weight * mse_loss
1880
+ # true loss = total loss / total samples
1881
+ loss = jax.lax.psum(loss, "batch")
1882
+ num_labels = jax.lax.psum(num_labels, "batch")
1883
+ loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
1884
+
1885
+ # CE/KL/MSE losses for logging
1886
+ ce_loss = jax.lax.psum(ce_loss, "batch")
1887
+ ce_loss = jax.tree_util.tree_map(lambda x: x / num_labels, ce_loss)
1888
+
1889
+ kl_loss = jax.lax.psum(kl_loss, "batch")
1890
+ kl_loss = jax.tree_util.tree_map(lambda x: x / num_labels, kl_loss)
1891
+
1892
+ mse_loss = jax.lax.psum(mse_loss, "batch")
1893
+ mse_loss = jax.tree_util.tree_map(lambda x: x / num_labels, mse_loss)
1894
+
1895
+ metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss, "mse_loss": mse_loss}
1896
+ return metrics
1897
+
1898
+ # Define generation function
1899
+ num_beams = (
1900
+ training_args.generation_num_beams
1901
+ if training_args.generation_num_beams is not None
1902
+ else student_model.config.num_beams
1903
+ )
1904
+
1905
+ # forcing the language and task tokens helps the model in its generations
1906
+ gen_kwargs = {
1907
+ "max_length": max_label_length,
1908
+ "num_beams": num_beams,
1909
+ "language": "<|en|>",
1910
+ "task": "transcribe",
1911
+ "return_timestamps": return_timestamps,
1912
+ }
1913
+
1914
+ def generate_step(student_params, batch):
1915
+ output_ids = student_model.generate(
1916
+ batch[model_input_name],
1917
+ attention_mask=batch.get("attention_mask"),
1918
+ params=student_params,
1919
+ **gen_kwargs,
1920
+ )
1921
+ return output_ids.sequences
1922
+
1923
+ # Replicate the train state on each device
1924
+ student_state = student_state.replicate()
1925
+
1926
+ # Replicate the teacher params on each device
1927
+ teacher_params = jax_utils.replicate(teacher_params)
1928
+
1929
+ # Create parallel version of the train and eval step
1930
+ p_train_step = jax.pmap(
1931
+ train_step,
1932
+ "batch",
1933
+ in_axes=(0, 0, 0, None, None, None),
1934
+ donate_argnums=(0,),
1935
+ static_broadcasted_argnums=(
1936
+ 3,
1937
+ 4,
1938
+ ),
1939
+ )
1940
+ p_eval_step = jax.pmap(eval_step, "batch")
1941
+ p_generate_step = jax.pmap(generate_step, "batch")
1942
+
1943
+ logger.info("***** Running training *****")
1944
+ logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
1945
+ logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1946
+ logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1947
+ logger.info(
1948
+ f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1949
+ )
1950
+ logger.info(f" Total optimization steps = {total_train_steps}")
1951
+
1952
+ # ======================== Training ================================
1953
+ train_time = 0
1954
+ train_start = time.time()
1955
+ train_metrics = []
1956
+ batches_to_skip = jax.device_get(unreplicate(student_state.step))
1957
+ cur_step = int(batches_to_skip) # will be zero if starting from scratch
1958
+ epochs_trained = batches_to_skip // steps_per_epoch
1959
+ steps_trained_progress_bar = tqdm(range(total_train_steps), desc="Train steps ... ", position=0)
1960
+ steps_trained_progress_bar.update(batches_to_skip)
1961
+ continue_training = True
1962
+ minibatch_steps = 0
1963
+ print("Debug 8")
1964
+ if batches_to_skip > 0:
1965
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1966
+ logger.info(f" Continuing training from epoch {epochs_trained}")
1967
+ logger.info(f" Continuing training from global step {batches_to_skip}")
1968
+ print("debug 9")
1969
+ # Generate a training data loader by shuffling sampling indices from the train dataset
1970
+ train_loader = get_data_loader(
1971
+ training_args.seed,
1972
+ vectorized_datasets["train"],
1973
+ batch_size=train_batch_size,
1974
+ data_collator=data_collator,
1975
+ dataloader_num_workers=dataloader_num_workers,
1976
+ skip_batches=batches_to_skip,
1977
+ prefetch_size=dataloader_prefetch_size,
1978
+ )
1979
+ print("debug 10")
1980
+
1981
+ for epoch in range(epochs_trained, num_epochs):
1982
+ print("Debug 11")
1983
+ if hasattr(train_loader, "dataset") and isinstance(train_loader.dataset, IterableDataset):
1984
+ print("Debug 11B")
1985
+ train_loader.dataset.set_epoch(epoch)
1986
+ # breakpoint()  # debug leftover, disabled so the training loop can run unattended
1987
+ print("debug 12")
1988
+ for batch in train_loader:
1989
+ print("debug 13")
1990
+ minibatch_steps += 1
1991
+ update_step = minibatch_steps == gradient_accumulation_steps
1992
+
1993
+ if update_step:
1994
+ steps_trained_progress_bar.update(1)
1995
+ cur_step += 1
1996
+ minibatch_steps = 0
1997
+ print("debug 14")
1998
+ batch = shard(batch.data)
1999
+ student_state, train_metric = p_train_step(
2000
+ student_state,
2001
+ teacher_params,
2002
+ batch,
2003
+ training_args.freeze_encoder,
2004
+ share_hidden_states,
2005
+ training_args.temperature,
2006
+ )
2007
+ print("debug 15")
2008
+ if cur_step % training_args.logging_steps == 0 and update_step:
2009
+ train_metrics.append(train_metric)
2010
+ train_metric_to_write = unreplicate(train_metric)
2011
+ steps_trained_progress_bar.write(
2012
+ f"Step... ({cur_step} / {total_train_steps} | Loss:"
2013
+ f" {train_metric_to_write['loss']}, Learning Rate:"
2014
+ f" {train_metric_to_write['learning_rate']})"
2015
+ )
2016
+ print("debug 16")
2017
+ if has_wandb and jax.process_index() == 0:
2018
+ write_wandb_metric(
2019
+ wandb_logger,
2020
+ train_metric_to_write,
2021
+ train_time + time.time() - train_start,
2022
+ cur_step,
2023
+ epoch,
2024
+ prefix="train",
2025
+ )
2026
+ print("debug 17")
2027
+ # save checkpoint and weights after each save_steps and at the end of training
2028
+ if (cur_step % training_args.save_steps == 0 and update_step) or cur_step == total_train_steps:
2029
+ if jax.process_index() == 0:
2030
+ save_hf_weights(
2031
+ student_state,
2032
+ student_model,
2033
+ processor,
2034
+ training_args.output_dir,
2035
+ cur_step,
2036
+ total_train_steps,
2037
+ use_scan=training_args.use_scan,
2038
+ )
2039
+ if training_args.save_train_state:
2040
+ student_state.save_state(
2041
+ training_args.output_dir, save_total_limit=training_args.save_total_limit
2042
+ )
2043
+ if training_args.push_to_hub:
2044
+ repo.push_to_hub(
2045
+ commit_message=f"Saving train state of step {cur_step}",
2046
+ blocking=False,
2047
+ )
2048
+
2049
+ if training_args.do_eval and (
2050
+ (cur_step % eval_steps == 0 and update_step) or cur_step == total_train_steps
2051
+ ):
2052
+ train_time += time.time() - train_start
2053
+ # ======================== Evaluating ==============================
2054
+ for eval_split in all_eval_splits:
2055
+ eval_metrics = []
2056
+ eval_preds = []
2057
+ eval_labels = []
2058
+ eval_start = time.time()
2059
+
2060
+ eval_loader = get_data_loader(
2061
+ training_args.seed,
2062
+ vectorized_datasets[eval_split],
2063
+ batch_size=eval_batch_size,
2064
+ data_collator=data_collator,
2065
+ shuffle=False,
2066
+ drop_last=False,
2067
+ dataloader_num_workers=dataloader_num_workers,
2068
+ )
2069
+ for batch in tqdm(eval_loader, desc=f"Evaluating {eval_split}...", position=2):
2070
+ # Model forward
2071
+ labels = batch["labels"]
2072
+
2073
+ metrics = pad_shard_unpad(
2074
+ p_eval_step,
2075
+ static_argnums=(
2076
+ 0,
2077
+ 1,
2078
+ ),
2079
+ static_return=True,
2080
+ )(
2081
+ student_state.params,
2082
+ teacher_params,
2083
+ batch.data,
2084
+ min_device_batch=per_device_eval_batch_size,
2085
+ )
2086
+ eval_metrics.append(metrics)
2087
+
2088
+ # generation
2089
+ if training_args.predict_with_generate:
2090
+ generated_ids = pad_shard_unpad(p_generate_step)(
2091
+ student_state.params, batch.data, min_device_batch=per_device_eval_batch_size
2092
+ )
2093
+ eval_preds.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
2094
+ eval_labels.extend(labels)
2095
+
2096
+ eval_time = time.time() - eval_start
2097
+
2098
+ # normalize eval metrics
2099
+ eval_metrics = get_metrics(eval_metrics)
2100
+ eval_metrics = jax.tree_util.tree_map(jnp.mean, eval_metrics)
2101
+
2102
+ # compute WER metric
2103
+ wer_desc = ""
2104
+ if training_args.predict_with_generate:
2105
+ wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
2106
+ eval_preds, eval_labels
2107
+ )
2108
+ eval_metrics.update(wer_metric)
2109
+ wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
2110
+
2111
+ # Print metrics and update progress bar
2112
+ steps_trained_progress_bar.write(
2113
+ f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
2114
+ f" {wer_desc})"
2115
+ )
2116
+
2117
+ if has_tensorboard and jax.process_index() == 0:
2118
+ write_eval_metric(
2119
+ summary_writer,
2120
+ eval_metrics,
2121
+ cur_step,
2122
+ prefix=eval_split,
2123
+ )
2124
+
2125
+ if has_wandb and jax.process_index() == 0:
2126
+ write_wandb_metric(wandb_logger, eval_metrics, eval_time, cur_step, epoch, prefix=eval_split)
2127
+ if training_args.predict_with_generate:
2128
+ write_wandb_pred(
2129
+ wandb_logger,
2130
+ pred_str,
2131
+ label_str,
2132
+ norm_pred_str,
2133
+ norm_label_str,
2134
+ cur_step,
2135
+ prefix=eval_split,
2136
+ )
2137
+
2138
+ if has_tensorboard and jax.process_index() == 0:
2139
+ # we'll only log to tensorboard every eval steps
2140
+ write_train_metric(
2141
+ summary_writer,
2142
+ train_metrics,
2143
+ train_time,
2144
+ cur_step,
2145
+ training_args.logging_steps,
2146
+ )
2147
+
2148
+ # flush the train metrics
2149
+ train_start = time.time()
2150
+ train_metrics = []
2151
+
2152
+ # break condition
2153
+ if cur_step == total_train_steps:
2154
+ continue_training = False
2155
+ break
2156
+
2157
+ if not continue_training:
2158
+ break
2159
+
2160
+
2161
+ if __name__ == "__main__":
2162
+ main()
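The objective implemented in `train_step` and `eval_step` above reduces to a weighted sum of a cross-entropy term on the pseudo-labels, a temperature-scaled KL term between the teacher and student distributions (with gradients stopped through the teacher), and an optional MSE term on hidden states. A minimal, self-contained sketch of that formula on toy logits, illustrative only: the inputs are made up, and the real script additionally masks padded labels and averages with `psum` across devices.

import jax
import jax.numpy as jnp

def distillation_loss_sketch(student_logits, teacher_logits, labels,
                             temperature=2.0, kl_weight=1.0, mse_weight=0.0, mse_loss=0.0):
    # CE (data) loss against the pseudo-labels
    log_probs = jax.nn.log_softmax(student_logits, axis=-1)
    ce_loss = -jnp.take_along_axis(log_probs, labels[..., None], axis=-1).mean()
    # temperature-softened teacher distribution; no gradients flow back through the teacher
    teacher_dist = jax.lax.stop_gradient(jax.nn.softmax(teacher_logits / temperature, axis=-1))
    student_log_dist = jax.nn.log_softmax(student_logits / temperature, axis=-1)
    # KL(teacher || student), rescaled by temperature**2 so its gradient scale matches the CE term
    kl_loss = (teacher_dist * (jnp.log(teacher_dist) - student_log_dist)).sum(axis=-1).mean() * temperature**2
    ce_weight = 0.8 if kl_weight > 0 else 1.0  # DistilBART-style weighting, as in the script
    return ce_weight * ce_loss + kl_weight * kl_loss + mse_weight * mse_loss

student = jnp.array([[[2.0, 0.5, -1.0], [0.1, 1.2, 0.3]]])  # (batch, seq_len, vocab)
teacher = jnp.array([[[2.5, 0.2, -0.8], [0.0, 1.5, 0.1]]])
labels = jnp.array([[0, 1]])
print(distillation_loss_sketch(student, teacher, labels))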
run_distillation_nodes.py ADDED
@@ -0,0 +1,2168 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Training the Whisper model for sequence to sequence speech recognition via teacher-student distillation.
18
+ """
19
+ # You can also adapt this script for your own distillation tasks. Pointers for this are left as comments.
20
+
21
+ import logging
22
+ import os
23
+ import re
24
+ import shutil
25
+ import string
26
+ import sys
27
+ import time
28
+ from dataclasses import dataclass, field
29
+ from functools import partial
30
+ from pathlib import Path
31
+ from typing import Any, Callable, Dict, List, Optional, Union
32
+
33
+ import datasets
34
+ import evaluate
35
+ import flax
36
+ import jax
37
+ import jax.numpy as jnp
38
+ import numpy as np
39
+ import optax
40
+ import torch
41
+ import transformers
42
+ from datasets import (
43
+ DatasetDict,
44
+ IterableDataset,
45
+ IterableDatasetDict,
46
+ concatenate_datasets,
47
+ interleave_datasets,
48
+ load_dataset,
49
+ )
50
+ from datasets.distributed import split_dataset_by_node
51
+ from flax import jax_utils, traverse_util
52
+ from flax.jax_utils import pad_shard_unpad, unreplicate
53
+ from flax.serialization import from_bytes, to_bytes
54
+ from flax.training import train_state
55
+ from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
56
+ from huggingface_hub import Repository, create_repo
57
+ from jax.experimental.compilation_cache import compilation_cache as cc
58
+ from optax._src import linear_algebra
59
+ from torch.utils.data import DataLoader
60
+ from torchdata.datapipes.iter import IterableWrapper
61
+ from tqdm import tqdm
62
+ from transformers import (
63
+ AddedToken,
64
+ HfArgumentParser,
65
+ Seq2SeqTrainingArguments,
66
+ WhisperConfig,
67
+ WhisperFeatureExtractor,
68
+ WhisperProcessor,
69
+ WhisperTokenizerFast,
70
+ is_tensorboard_available,
71
+ is_wandb_available,
72
+ set_seed,
73
+ )
74
+ from transformers.file_utils import get_full_repo_name
75
+ from transformers.modeling_flax_outputs import FlaxBaseModelOutput
76
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer, EnglishTextNormalizer
77
+ from transformers.utils import check_min_version, send_example_telemetry
78
+ from transformers.utils.versions import require_version
79
+
80
+ from distil_whisper import FlaxWhisperForConditionalGeneration
81
+
82
+
83
+ # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
84
+ check_min_version("4.27.0.dev0")
85
+
86
+ require_version(
87
+ "datasets>=1.18.0",
88
+ "To fix: pip install -r examples/flax/speech-recogintion/requirements.txt",
89
+ )
90
+
91
+ logger = logging.getLogger(__name__)
92
+
93
+
94
+ @flax.struct.dataclass
95
+ class ModelArguments:
96
+ """
97
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
98
+ """
99
+
100
+ model_name_or_path: str = field(
101
+ metadata={"help": ("Path to pretrained student model or model identifier from huggingface.co/models")}
102
+ )
103
+ teacher_model_name_or_path: str = field(
104
+ metadata={"help": ("Path to pretrained teacher model or model identifier from huggingface.co/models")}
105
+ )
106
+ config_name: Optional[str] = field(
107
+ default=None,
108
+ metadata={"help": "Pretrained config name or path if not the same as model_name"},
109
+ )
110
+ tokenizer_name: Optional[str] = field(
111
+ default=None,
112
+ metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
113
+ )
114
+ feature_extractor_name: Optional[str] = field(
115
+ default=None,
116
+ metadata={"help": "feature extractor name or path if not the same as model_name"},
117
+ )
118
+ cache_dir: Optional[str] = field(
119
+ default=None,
120
+ metadata={"help": ("Where to store the pretrained models downloaded from huggingface.co")},
121
+ )
122
+ use_fast_tokenizer: bool = field(
123
+ default=True,
124
+ metadata={"help": ("Whether to use one of the fast tokenizer (backed by the tokenizers library) or not.")},
125
+ )
126
+ model_revision: str = field(
127
+ default="main",
128
+ metadata={"help": ("The specific model version to use (can be a branch name, tag name or commit id).")},
129
+ )
130
+ subfolder: str = field(
131
+ default="",
132
+ metadata={
133
+ "help": "In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can"
134
+ "specify the folder name here."
135
+ },
136
+ )
137
+ use_auth_token: bool = field(
138
+ default=False,
139
+ metadata={
140
+ "help": (
141
+ "Will use the token generated when running `transformers-cli login`"
142
+ " (necessary to use this script with private models)."
143
+ )
144
+ },
145
+ )
146
+ dtype: Optional[str] = field(
147
+ default="float32",
148
+ metadata={
149
+ "help": (
150
+ "Floating-point format in which the model weights should be initialized"
151
+ " and trained. Choose one of `[float32, float16, bfloat16]`."
152
+ )
153
+ },
154
+ )
155
+ load_with_scan_weights: bool = field(
156
+ default=False,
157
+ metadata={
158
+ "help": "Whether the pre-trained checkpoint has its weights stored in scan format. Set to True for scanned "
159
+ "weights, defaults to False for non-scan (unrolled) weights."
160
+ },
161
+ )
162
+ activation_dropout: float = field(
163
+ default=0.0,
164
+ metadata={"help": "The dropout ratio for activations inside the fully connected layer."},
165
+ )
166
+ attention_dropout: float = field(
167
+ default=0.0,
168
+ metadata={"help": "The dropout ratio for the attention probabilities."},
169
+ )
170
+ dropout: float = field(
171
+ default=0.0,
172
+ metadata={
173
+ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
174
+ },
175
+ )
176
+
177
+
178
+ @flax.struct.dataclass
179
+ class DataTrainingArguments:
180
+ """
181
+ Arguments pertaining to what data we are going to input our model for training and eval.
182
+ """
183
+
184
+ train_dataset_name: str = field(
185
+ default=None,
186
+ metadata={
187
+ "help": "The name of the training dataset to use (via the datasets library). Load and combine "
188
+ "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
189
+ " librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
190
+ },
191
+ )
192
+ train_dataset_config_name: Optional[str] = field(
193
+ default=None,
194
+ metadata={
195
+ "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
196
+ "multiple datasets by separating dataset configs by a '+' symbol."
197
+ },
198
+ )
199
+ train_dataset_samples: str = field(
200
+ default=None,
201
+ metadata={
202
+ "help": "Number of samples in the training data. Load and combine "
203
+ "multiple datasets by separating dataset samples by a '+' symbol."
204
+ },
205
+ )
206
+ eval_dataset_name: str = field(
207
+ default=None,
208
+ metadata={
209
+ "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
210
+ },
211
+ )
212
+ eval_dataset_config_name: Optional[str] = field(
213
+ default=None,
214
+ metadata={
215
+ "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
216
+ },
217
+ )
218
+ dataset_cache_dir: Optional[str] = field(
219
+ default=None,
220
+ metadata={"help": "Path to cache directory for saving and loading datasets"},
221
+ )
222
+ overwrite_cache: bool = field(
223
+ default=False,
224
+ metadata={"help": "Overwrite the cached training and evaluation sets"},
225
+ )
226
+ preprocessing_num_workers: Optional[int] = field(
227
+ default=None,
228
+ metadata={"help": "The number of processes to use for the preprocessing."},
229
+ )
230
+ max_train_samples: Optional[int] = field(
231
+ default=None,
232
+ metadata={
233
+ "help": (
234
+ "For debugging purposes or quicker training, truncate the number of"
235
+ " training examples to this value if set."
236
+ )
237
+ },
238
+ )
239
+ max_eval_samples: Optional[int] = field(
240
+ default=None,
241
+ metadata={
242
+ "help": (
243
+ "For debugging purposes or quicker training, truncate the number of"
244
+ " evaluation examples to this value if set."
245
+ )
246
+ },
247
+ )
248
+ audio_column_name: str = field(
249
+ default="audio",
250
+ metadata={"help": ("The name of the dataset column containing the audio data. Defaults to 'audio'")},
251
+ )
252
+ train_text_column_name: str = field(
253
+ default="whisper_transcript",
254
+ metadata={
255
+ "help": (
256
+ "The name of the dataset column containing the text data. Defaults to"
257
+ " 'whisper_transcript'which is the pseudo-labelled Whisper"
258
+ " transcription data."
259
+ )
260
+ },
261
+ )
262
+ eval_text_column_name: str = field(
263
+ default="text",
264
+ metadata={
265
+ "help": (
266
+ "The name of the dataset column containing the text data. Defaults to"
267
+ " 'text', which is the original text data"
268
+ )
269
+ },
270
+ )
271
+ max_duration_in_seconds: float = field(
272
+ default=30.0,
273
+ metadata={"help": ("Filter audio files that are longer than `max_duration_in_seconds` seconds")},
274
+ )
275
+ min_duration_in_seconds: float = field(
276
+ default=0.0,
277
+ metadata={"help": ("Filter audio files that are shorter than `min_duration_in_seconds` seconds")},
278
+ )
279
+ max_label_length: int = field(
280
+ default=128,
281
+ metadata={"help": "Truncate transcriptions that are longer `max_label_length` tokens."},
282
+ )
283
+ pad_target_to_multiple_of: Optional[int] = field(
284
+ default=None,
285
+ metadata={
286
+ "help": (
287
+ "If set will pad the target sequence to a multiple of the provided"
288
+ " value. This is important to avoid triggering recompilations on TPU."
289
+ " If unspecified, will default to padding the targets to max length."
290
+ )
291
+ },
292
+ )
293
+ preprocessing_only: bool = field(
294
+ default=False,
295
+ metadata={
296
+ "help": (
297
+ "Whether to only do data preprocessing and skip training. This is"
298
+ " especially useful when data preprocessing errors out in distributed"
299
+ " training due to timeout. In this case, one should run the"
300
+ " preprocessing in a non-distributed setup with"
301
+ " `preprocessing_only=True` so that the cached datasets can"
302
+ " consequently be loaded in distributed training"
303
+ )
304
+ },
305
+ )
306
+ train_split_name: str = field(
307
+ default="train",
308
+ metadata={
309
+ "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
310
+ },
311
+ )
312
+ eval_split_name: str = field(
313
+ default="validation",
314
+ metadata={
315
+ "help": (
316
+ "The name of the evaluation data set split to use (via the datasets"
317
+ " library). Defaults to 'validation'"
318
+ )
319
+ },
320
+ )
321
+ wandb_project: str = field(
322
+ default="distil-whisper",
323
+ metadata={"help": "The name of the wandb project."},
324
+ )
325
+ wandb_name: str = field(
326
+ default=None,
327
+ metadata={"help": "The name of the wandb run."},
328
+ )
329
+ wandb_job_type: str = field(
330
+ default="distil-whisper",
331
+ metadata={"help": "The name of the wandb job type."},
332
+ )
333
+ wandb_dir: str = field(
334
+ default=None,
335
+ metadata={"help": "The absolute path to save the wandb logs."},
336
+ )
337
+ save_code_to_wandb: bool = field(
338
+ default=False,
339
+ metadata={
340
+ "help": (
341
+ "Whether to save main script to wandb. This is valuable for improving"
342
+ " experiment reproducibility and to diff code across experiments in"
343
+ " the UI."
344
+ )
345
+ },
346
+ )
347
+ streaming: bool = field(
348
+ default=True,
349
+ metadata={"help": "Whether to use Datasets' streaming mode to load and the data."},
350
+ )
351
+ wer_threshold: float = field(
352
+ default=None,
353
+ metadata={
354
+ "help": "Filter training data with Whisper transcriptions that have greater than `wer_threshold` "
355
+ "WER with the normalised transcriptions."
356
+ },
357
+ )
358
+ prefetch_size: int = field(
359
+ default=0,
360
+ metadata={"help": "Number of samples to pre-fetch if using an iterable dataset."},
361
+ )
362
+ timestamp_probability: float = field(
363
+ default=0.5, metadata={"help": "Probability for training on timestamped tokens if the data contains it."}
364
+ )
365
+ return_timestamps: bool = field(
366
+ default=False, metadata={"help": "Whether or not to predict timestamps in the generation step."}
367
+ )
368
+ round_timestamps: bool = field(
369
+ default=False,
370
+ metadata={
371
+ "help": "Whether or not to round the timestamp tokens to the nearest tenth of a second."
372
+ "By default, Whisper predicts timestamps to the nearest hundredth of a second."
373
+ "Reducing the timestamp precision to one tenth of a second simplifies the timestamp"
374
+ "prediction task, at the expense of timestamp granularity."
375
+ },
376
+ )
377
+
378
+
379
+ @dataclass
380
+ class FlaxSeq2SeqTrainingArguments(Seq2SeqTrainingArguments):
381
+ use_scan: Optional[bool] = field(
382
+ default=True,
383
+ metadata={
384
+ "help": (
385
+ "Whether or not to use `scan_with_axes` over the encoder and decoder blocks. Using scan results "
386
+ "in faster compile times and more efficient memory use during training, since all of the layers "
387
+ "in the encoder/decoder are stacked, and we perform a lax.scan over the stacked block to index "
388
+ "each layer. However, it results in slower inference time due to the overhead of stacking the "
389
+ "layers this way. Thus, we **always** default to disabling scan for the inference step."
390
+ )
391
+ },
392
+ )
393
+ freeze_encoder: Optional[bool] = field(
394
+ default=False,
395
+ metadata={
396
+ "help": (
397
+ "Whether to freeze the entire encoder model. Only recommended when the entire encoder has been "
398
+ "copied from the teacher model."
399
+ )
400
+ },
401
+ )
402
+ temperature: Optional[float] = field(
403
+ default=2.0, metadata={"help": "Temperature to anneal the logits when computing the softmax."}
404
+ )
405
+ kl_weight: Optional[float] = field(
406
+ default=1.0,
407
+ metadata={
408
+ "help": (
409
+ "Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
410
+ "computed between the teacher-student hidden states and attentions."
411
+ )
412
+ },
413
+ )
414
+ mse_weight: Optional[float] = field(
415
+ default=0.0,
416
+ metadata={
417
+ "help": (
418
+ "Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
419
+ "computed between the teacher-student hidden states and attentions."
420
+ )
421
+ },
422
+ )
423
+ precision: Optional[str] = field(
424
+ default="half_mixed",
425
+ metadata={
426
+ "help": (
427
+ "Precision with which run training, Can be one of `full`, `half_mixed` or `full_mixed`, the latter two"
428
+ "of which enable *mixed-precision* training. **Note that this only specifies the dtype of the computation "
429
+ "and optimizer state. It does not influence the dtype of model parameters.** An explanation of the three "
430
+ "settings is provided below:"
431
+ " 1. Full precision: forward pass, backward pass and optimiser states all in float32."
432
+ " 2. Half mixed precision: forward pass in bfloat16, backward pass and optimiser states in float32. This "
433
+ " corresponds to setting the dtype argument to bfloat16 when instantiating the model."
434
+ " 3. Full mixed precision: forward pass, backward pass and optimiser states all in bfloat16. The dtype "
435
+ " argument is set to bfloat16 for the forward pass, and the gradients computed with respect to the bfloat16 "
436
+ " parameters in the backward pass (giving bfloat16 gradients). The new optimiser states and parameter "
437
+ " updates are computed in float32 by upcasting the bfloat16 gradients and optimiser states to float32 "
438
+ " prior to the optimiser update step. The optimiser states are returned in float32 (but not saved to "
439
+ " memory) and then downcasted to bfloat16 (saved to memory) for the subsequent train step."
440
+ "For further details, refer to https://github.com/deepmind/optax/discussions/336"
441
+ )
442
+ },
443
+ )
444
+ compilation_cache: Optional[bool] = field(
445
+ default=False,
446
+ metadata={
447
+ "help": (
448
+ "Whether to enable the JAX (experimental) compilation cache. The compilation step is *cached* the "
449
+ "first time it is run. Successive compilation steps for the same function utilise the cache to reduce"
450
+ "the compilation time."
451
+ )
452
+ },
453
+ )
454
+ save_train_state: Optional[bool] = field(
455
+ default=False,
456
+ metadata={
457
+ "help": "Whether or not to save the Flax Train State on each `save_steps` steps. Required if you intend"
458
+ "to resume training from partial training runs. If False, only the model weights will be saved."
459
+ "If True, both the model weights and Flax Train state will be saved."
460
+ },
461
+ )
462
+
463
+
464
+ def shift_tokens_right(label_ids: np.array, decoder_start_token_id: int) -> np.ndarray:
465
+ """
466
+ Shift label ids one token to the right.
467
+ """
468
+ shifted_label_ids = np.zeros_like(label_ids)
469
+ shifted_label_ids[:, 1:] = label_ids[:, :-1]
470
+ shifted_label_ids[:, 0] = decoder_start_token_id
471
+
472
+ return shifted_label_ids
473
+
474
+
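A quick usage sketch of `shift_tokens_right` above (illustrative only; the token ids are made up and 50258 simply stands in for a decoder start token):

import numpy as np
print(shift_tokens_right(np.array([[5, 6, 7]]), decoder_start_token_id=50258))
# [[50258     5     6]]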
475
+ @flax.struct.dataclass
476
+ class FlaxDataCollatorSpeechSeq2SeqWithPadding:
477
+ """
478
+ Data collator that will dynamically pad the inputs received.
479
+ Args:
480
+ processor ([`WhisperProcessor`])
481
+ The processor used for processing the data.
482
+ decoder_start_token_id (:obj: `int`)
483
+ The start-of-sequence token id of the decoder.
484
+ decoder_prev_token_id (:obj: `int`)
485
+ The start-of-prompt token id of the decoder
486
+ input_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
487
+ Select a strategy to pad the returned input sequences (according to the model's padding side and padding index)
488
+ among:
489
+ * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
490
+ sequence if provided).
491
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
492
+ maximum acceptable input length for the model if that argument is not provided.
493
+ * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
494
+ different lengths).
495
+ target_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
496
+ Select a strategy to pad the returned target sequences (according to the model's padding side and padding index).
497
+ See above for details.
498
+ max_target_length (:obj:`int`, `optional`):
499
+ Maximum length of the ``labels`` of the returned list and optionally padding length (see above).
500
+ """
501
+
502
+ processor: Any
503
+ decoder_start_token_id: int
504
+ decoder_prev_token_id: int
505
+ input_padding: Union[bool, str] = "max_length"
506
+ target_padding: Union[bool, str] = "max_length"
507
+ max_target_length: Optional[int] = None
508
+
509
+ def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
510
+ # split inputs and labels since they have to be of different lengths and need
511
+ # different padding methods
512
+ model_input_name = self.processor.model_input_names[0]
513
+
514
+ # dataloader returns a list of features which we convert to a dict
515
+ input_features = {model_input_name: [feature[model_input_name] for feature in features]}
516
+ label_features = {"input_ids": [feature["labels"] for feature in features]}
517
+
518
+ # reformat list to dict and set to pytorch format
519
+ batch = self.processor.feature_extractor.pad(
520
+ input_features,
521
+ padding=self.input_padding,
522
+ return_tensors="np",
523
+ )
524
+
525
+ labels_batch = self.processor.tokenizer.pad(
526
+ label_features,
527
+ max_length=self.max_target_length,
528
+ padding=self.target_padding,
529
+ return_tensors="np",
530
+ )
531
+
532
+ # if bos token is appended in previous tokenization step,
533
+ # cut the bos token here as it's appended later anyway
534
+ labels = labels_batch["input_ids"]
535
+ if set(np.unique(labels[:, 0])).issubset({self.decoder_start_token_id, self.decoder_prev_token_id}):
536
+ decoder_input_ids = labels[:, :-1]
537
+ labels = labels[:, 1:]
538
+ labels_batch.attention_mask = labels_batch.attention_mask[:, 1:]
539
+ else:
540
+ decoder_input_ids = shift_tokens_right(labels, self.decoder_start_token_id)
541
+
542
+ # replace padding with -100 to ignore correctly when computing the loss
543
+ labels = np.ma.array(labels, mask=np.not_equal(labels_batch.attention_mask, 1))
544
+ labels = labels.filled(fill_value=-100)
545
+
546
+ # replace initial prompt tokens with -100 to ignore correctly when computing the loss
547
+ bos_index = np.argmax(labels == self.decoder_start_token_id, axis=1)
548
+ prompt_mask = np.arange(labels.shape[1]) < bos_index[:, None]
549
+ labels = np.where(prompt_mask, -100, labels)
550
+
551
+ batch["labels"] = labels
552
+ batch["decoder_input_ids"] = decoder_input_ids
553
+
554
+ return batch
555
+
556
+
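The label preparation in `__call__` above is easiest to see on a toy batch. A minimal sketch of the same masking logic, illustrative only: the token ids are made up, and `np.where` stands in for the `np.ma` masking used in the collator.

import numpy as np

labels = np.array([[50361, 50258, 291, 366, 50257, 50257]])   # prev, start, "you", "are", pad, pad (made-up ids)
attention_mask = np.array([[1, 1, 1, 1, 0, 0]])
decoder_start_token_id = 50258

labels = np.where(attention_mask == 1, labels, -100)           # padding positions -> -100
bos_index = np.argmax(labels == decoder_start_token_id, axis=1)
prompt_mask = np.arange(labels.shape[1]) < bos_index[:, None]
labels = np.where(prompt_mask, -100, labels)                   # prompt tokens before the start token -> -100
print(labels)  # [[ -100 50258   291   366  -100  -100]]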
557
+ def get_data_loader(
558
+ seed: int,
559
+ dataset: IterableDataset,
560
+ batch_size: int,
561
+ data_collator: FlaxDataCollatorSpeechSeq2SeqWithPadding,
562
+ shuffle: bool = False,
563
+ drop_last: bool = True,
564
+ dataloader_num_workers: int = 0,
565
+ skip_batches: int = 0,
566
+ pin_memory: bool = True,
567
+ prefetch_size: int = 0,
568
+ ) -> DataLoader:
569
+ """
570
+ Returns batches of size `batch_size` from `dataset`. If `drop_last` is set to `False`, the final batch may be incomplete,
571
+ and may range in size from 1 to `batch_size`. Batches are shuffled if `shuffle` is `True`.
572
+
573
+ Args:
574
+ seed (int): Numpy seed for generating pseudo random numbers. Used if shuffling the dataset.
575
+ dataset (IterableDataset): streaming dataset from which to load the data.
576
+ batch_size (int): how many samples per batch to load.
577
+ data_collator (FlaxDataCollatorSpeechSeq2SeqWithPadding, optional): merges a list of samples to form a
578
+ mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
579
+ shuffle (bool, optional): set to `True` to have the batches reshuffled.
580
+ drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
581
+ if the dataset size is not divisible by the batch size. If ``False`` and
582
+ the size of dataset is not divisible by the batch size, then the last batch
583
+ will be smaller. (default: ``True``)
584
+ dataloader_num_workers (int, optional): how many subprocesses to use for data
585
+ loading. ``0`` means that the data will be loaded in the main process.
586
+ (default: ``0``)
587
+ skip_batches (int, optional): Efficiently skip the first `skip_batches` batches.
588
+ pin_memory (bool, optional): If ``True``, the data loader will copy Tensors
589
+ into device/CUDA pinned memory before returning them. If your data elements
590
+ are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type,
591
+ refer to the PyTorch DataLoader documentation.
592
+
593
+ """
594
+ if shuffle:
595
+ dataset = dataset.shuffle(seed)
596
+
597
+ if skip_batches > 0:
598
+ dataset = dataset.skip(skip_batches * batch_size)
599
+
600
+ if prefetch_size > 0:
601
+ dataset = IterableWrapper(dataset)
602
+ dataset = dataset.prefetch(prefetch_size)
603
+
604
+ num_of_hosts = jax.process_count()
605
+ dataset = split_dataset_by_node(dataset, rank=jax.process_index(), world_size=num_of_hosts)
606
+
607
+ assert batch_size % num_of_hosts == 0, "Batch size must be divisible by the number of hosts."
608
+ if dataset.n_shards < dataloader_num_workers:
609
+ dataloader_num_workers = dataset.n_shards
610
+
611
+ data_loader = DataLoader(
612
+ dataset,
613
+ batch_size=batch_size // num_of_hosts,
614
+ drop_last=drop_last,
615
+ pin_memory=pin_memory,
616
+ collate_fn=data_collator,
617
+ num_workers=dataloader_num_workers,
618
+ )
619
+
620
+ return data_loader
621
+
622
+
623
+ def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
624
+ ordering_and_checkpoint_path = []
625
+
626
+ glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]
627
+
628
+ for path in glob_checkpoints:
629
+ if use_mtime:
630
+ ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
631
+ else:
632
+ regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
633
+ if regex_match is not None and regex_match.groups() is not None:
634
+ ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
635
+
636
+ checkpoints_sorted = sorted(ordering_and_checkpoint_path)
637
+ checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
638
+ return checkpoints_sorted
639
+
640
+
641
+ def rotate_checkpoints(
642
+ save_total_limit=None, use_mtime=False, output_dir=None, checkpoint_prefix="checkpoint"
643
+ ) -> None:
644
+ if save_total_limit is None or save_total_limit <= 0:
645
+ return
646
+
647
+ # Check if we should delete older checkpoint(s)
648
+ checkpoints_sorted = sorted_checkpoints(
649
+ use_mtime=use_mtime, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
650
+ )
651
+ if len(checkpoints_sorted) <= save_total_limit:
652
+ return
653
+
654
+ number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
655
+ checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
656
+ for checkpoint in checkpoints_to_be_deleted:
657
+ logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
658
+ shutil.rmtree(checkpoint, ignore_errors=True)
659
+
660
+
661
+ def to_fp32(t):
662
+ return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
663
+
664
+
665
+ def to_bf16(t):
666
+ return jax.tree_map(lambda x: x.astype(jnp.bfloat16) if x.dtype == jnp.float32 else x, t)
667
+
668
+
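A small sketch of how the `to_fp32`/`to_bf16` helpers above realise the `precision` modes: parameters and gradients are cast leaf-by-leaf, so the forward pass can run in bfloat16 while the optimiser update stays in float32. Illustrative only; the parameter names and shapes are made up, and the helpers defined above are assumed to be in scope.

params = {"encoder": {"kernel": jnp.ones((2, 2), dtype=jnp.float32)},
          "decoder": {"bias": jnp.zeros((2,), dtype=jnp.float32)}}
bf16_params = to_bf16(params)       # cast for the bfloat16 forward pass
fp32_params = to_fp32(bf16_params)  # cast back before the float32 optimiser update
print(bf16_params["encoder"]["kernel"].dtype, fp32_params["encoder"]["kernel"].dtype)
# bfloat16 float32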
669
+ class TrainState(train_state.TrainState):
670
+ dropout_rng: jnp.ndarray
671
+ max_grad_norm: float
672
+
673
+ def apply_gradients(self, *, grads, to_dtype: to_fp32, **kwargs):
674
+ """Updates `step`, `params`, `opt_state` and `**kwargs` in return value, clipping the
675
+ gradients by the maximum grad norm.
676
+
677
+ Note that internally this function calls `.tx.update()` followed by a call
678
+ to `optax.apply_updates()` to update `params` and `opt_state`.
679
+
680
+ Args:
681
+ grads: Gradients that have the same pytree structure as `.params`.
682
+ **kwargs: Additional dataclass attributes that should be `.replace()`-ed.
683
+
684
+ Returns:
685
+ An updated instance of `self` with `step` incremented by one, `params`
686
+ and `opt_state` updated by applying `grads`, and additional attributes
687
+ replaced as specified by `kwargs`.
688
+ """
689
+ # clip gradients by global l2 norm
690
+ casted_max_grad_norm = to_dtype(self.max_grad_norm)
691
+ g_norm = linear_algebra.global_norm(grads)
692
+ g_norm = jnp.maximum(casted_max_grad_norm, g_norm)
693
+ grads = jax.tree_map(lambda t: (t / g_norm) * casted_max_grad_norm, grads)
694
+
695
+ # perform update step in fp32 and subsequently downcast optimizer states if mixed precision training
696
+ # grads and opt_state in bf16 (need to upcast), params in fp32 (leave as is)
697
+ updates, new_opt_state = self.tx.update(to_fp32(grads), to_fp32(self.opt_state), self.params)
698
+
699
+ new_params = optax.apply_updates(self.params, updates)
700
+
701
+ return self.replace(
702
+ step=self.step + 1,
703
+ params=new_params,
704
+ opt_state=to_dtype(new_opt_state),
705
+ **kwargs,
706
+ )
707
+
708
+ @classmethod
709
+ def create(cls, *, apply_fn, params, tx, to_dtype: to_fp32, **kwargs):
710
+ """Creates a new instance with `step=0` and initialized `opt_state`."""
711
+ # downcast optimizer state to bf16 if mixed-precision training
712
+ opt_state = tx.init(to_dtype(params))
713
+ return cls(
714
+ step=0,
715
+ apply_fn=apply_fn,
716
+ params=params,
717
+ tx=tx,
718
+ opt_state=opt_state,
719
+ **kwargs,
720
+ )
721
+
722
+ def replicate(self):
723
+ return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
724
+
725
+ def unreplicate(self):
726
+ return jax_utils.unreplicate(self)
727
+
728
+ def save_state(self, output_dir, save_total_limit=None, checkpoint_prefix="checkpoint"):
729
+ step = int(jax.device_get(unreplicate(self.step)))
730
+ serialized_state = to_bytes(self.unreplicate())
731
+
732
+ output_file = Path(os.path.join(output_dir, f"{checkpoint_prefix}-{step}", "train_state.msgpack"))
733
+ output_file.parent.mkdir(exist_ok=True, parents=True)
734
+
735
+ with output_file.open("wb") as f:
736
+ f.write(serialized_state)
737
+
738
+ logger.info(f"Flax train state saved in {output_file}")
739
+ rotate_checkpoints(
740
+ save_total_limit=save_total_limit, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
741
+ )
742
+
743
+
744
+ def save_hf_weights(
745
+ student_state: TrainState,
746
+ student_model: FlaxWhisperForConditionalGeneration,
747
+ processor: WhisperProcessor,
748
+ output_dir: str,
749
+ cur_step: int,
750
+ total_train_steps: int,
751
+ use_scan: bool = True,
752
+ checkpoint_prefix: str = "checkpoint",
753
+ ) -> None:
754
+ # always disable scan in the params / model so that we can load from PyTorch directly - this is a no-op if we're not using scan for training
755
+ student_state_params = unreplicate(student_state.params)
756
+ student_state_params = student_model.convert_scan_to_unroll(student_state_params)
757
+ student_params = jax.device_get(student_state_params)
758
+ student_model.disable_scan()
759
+
760
+ if cur_step != total_train_steps:
761
+ output_dir = os.path.join(output_dir, f"{checkpoint_prefix}-{cur_step}")
762
+ os.makedirs(output_dir, exist_ok=True)
763
+
764
+ student_model.save_pretrained(output_dir, params=student_params)
765
+ processor.save_pretrained(output_dir)
766
+
767
+ # re-enable scan only if required for training
768
+ if use_scan:
769
+ student_model.enable_scan()
770
+
771
+
772
+ def write_train_metric(summary_writer, train_metrics, train_time, step, logging_steps):
773
+ summary_writer.scalar("train/time", train_time, step)
774
+ # Check if train_metrics is empty
775
+ if not train_metrics:
776
+ print("DEBUG: train_metrics is empty; This is probably a bug that needs fixing.")
777
+ return # Early exit if train_metrics is empty to avoid further processing
778
+
779
+ train_metrics = get_metrics(train_metrics)
780
+ for key, vals in train_metrics.items():
781
+ steps_arr = np.arange(0, step, logging_steps)[-len(vals) :]
782
+ tag = f"train/{key}"
783
+ for i, val in enumerate(vals):
784
+ summary_writer.scalar(tag, val, steps_arr[i])
785
+
786
+
787
+ def write_eval_metric(summary_writer, eval_metrics, step, prefix="eval"):
788
+ for metric_name, value in eval_metrics.items():
789
+ summary_writer.scalar(f"{prefix}/{metric_name}", value, step)
790
+
791
+
792
+ def write_wandb_metric(wandb_logger, metrics, train_time, step, epoch, prefix="train"):
793
+ log_metrics = {}
794
+ for k, v in metrics.items():
795
+ log_metrics[f"{prefix}/{k}"] = v
796
+ log_metrics[f"{prefix}/time"] = train_time
797
+ log_metrics[f"{prefix}/epoch"] = epoch
798
+ wandb_logger.log(log_metrics, step)
799
+
800
+
801
+ def write_wandb_pred(
802
+ wandb_logger, pred_str, label_str, norm_pred_str, norm_label_str, cur_step, prefix="eval", num_lines=200000
803
+ ):
804
+ # pretty name for current step: step 50000 -> step 50k
805
+ cur_step_pretty = f"{int(cur_step // 1000)}k" if cur_step > 1000 else cur_step
806
+ # convert str data to a wandb compatible format
807
+ str_data = [[label_str[i], pred_str[i], norm_label_str[i], norm_pred_str[i]] for i in range(len(pred_str))]
808
+ # log as a table with the appropriate headers
809
+ wandb_logger.log(
810
+ {
811
+ f"predictions/{prefix.replace('/', '-')}-step-{cur_step_pretty}": wandb_logger.Table(
812
+ columns=["Target", "Pred", "Norm Target", "Norm Pred"], data=str_data[:num_lines]
813
+ )
814
+ },
815
+ cur_step,
816
+ )
817
+ # log incorrect normalised predictions
818
+ str_data = np.asarray(str_data)
819
+ str_data_incorrect = str_data[str_data[:, -2] != str_data[:, -1]]
820
+ # log as a table with the appropriate headers
821
+ wandb_logger.log(
822
+ {
823
+ f"incorrect_predictions/{prefix.replace('/', '-')}-step-{cur_step_pretty}": wandb_logger.Table(
824
+ columns=["Target", "Pred", "Norm Target", "Norm Pred"], data=str_data_incorrect[:num_lines]
825
+ )
826
+ },
827
+ cur_step,
828
+ )
829
+
830
+
831
+ def create_learning_rate_fn(
832
+ num_train_steps: int, lr_scheduler_type: str, num_warmup_steps: int, learning_rate: float
833
+ ) -> Callable[[int], jnp.array]:
834
+ """Returns a linear warmup, linear_decay learning rate function."""
835
+ lr_scheduler_types = ("linear", "constant_with_warmup")
836
+
837
+ if lr_scheduler_type not in lr_scheduler_types:
838
+ raise ValueError(
839
+ f"lr_scheduler_type of type {lr_scheduler_type} not supported, choose from {lr_scheduler_types}."
840
+ )
841
+
842
+ warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
843
+ decay_fn = optax.linear_schedule(
844
+ init_value=learning_rate,
845
+ end_value=0 if lr_scheduler_type == "linear" else learning_rate,
846
+ transition_steps=num_train_steps - num_warmup_steps,
847
+ )
848
+ schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
849
+ return schedule_fn
850
+
851
+
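A quick sketch of the schedule returned by `create_learning_rate_fn` above, sampled at a few steps (illustrative only; the step counts and peak learning rate are made-up values):

schedule = create_learning_rate_fn(
    num_train_steps=100, lr_scheduler_type="linear", num_warmup_steps=10, learning_rate=1e-3
)
print(schedule(0), schedule(10), schedule(55), schedule(100))
# ~0.0 (start of warmup), ~1e-3 (end of warmup), ~5e-4 (halfway through the decay), ~0.0 (final step)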
852
+ def convert_dataset_str_to_list(
853
+ dataset_names,
854
+ dataset_config_names,
855
+ splits=None,
856
+ text_column_names=None,
857
+ dataset_samples=None,
858
+ default_split="train",
859
+ ):
860
+ if isinstance(dataset_names, str):
861
+ dataset_names = dataset_names.split("+")
862
+
863
+ # we assume that all the datasets we're using derive from the distil-whisper org on the Hub - prepend the org name if necessary
864
+ for i in range(len(dataset_names)):
865
+ ds_name = dataset_names[i]
866
+ dataset_names[i] = f"distil-whisper/{ds_name}" if "/" not in ds_name else ds_name
867
+
868
+ dataset_config_names = dataset_config_names.split("+")
869
+ splits = splits.split("+") if splits is not None else None
870
+ text_column_names = text_column_names.split("+") if text_column_names is not None else None
871
+ dataset_samples = dataset_samples.split("+") if dataset_samples is not None else None
872
+
873
+ # basic checks to ensure we've got the right number of datasets/configs/splits/columns/probs
874
+ if len(dataset_names) != len(dataset_config_names):
875
+ raise ValueError(
876
+ f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
877
+ f" {len(dataset_config_names)} configs."
878
+ )
879
+
880
+ if splits is not None and len(splits) != len(dataset_names):
881
+ raise ValueError(
882
+ f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
883
+ )
884
+
885
+ if text_column_names is not None and len(text_column_names) != len(dataset_names):
886
+ raise ValueError(
887
+ f"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and"
888
+ f" {len(text_column_names)} text column names."
889
+ )
890
+
891
+ if dataset_samples is not None:
892
+ if len(dataset_samples) != len(dataset_names):
893
+ raise ValueError(
894
+ f"Ensure one sample is passed for each dataset, got {len(dataset_names)} datasets and "
895
+ f"{len(dataset_samples)} samples."
896
+ )
897
+ dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
898
+ else:
899
+ dataset_samples = [None] * len(dataset_names)
900
+
901
+ text_column_names = (
902
+ text_column_names if text_column_names is not None else ["text" for _ in range(len(dataset_names))]
903
+ )
904
+ splits = splits if splits is not None else [default_split for _ in range(len(dataset_names))]
905
+
906
+ dataset_names_dict = []
907
+ for i, ds_name in enumerate(dataset_names):
908
+ dataset_names_dict.append(
909
+ {
910
+ "name": ds_name,
911
+ "config": dataset_config_names[i],
912
+ "split": splits[i],
913
+ "text_column_name": text_column_names[i],
914
+ "samples": dataset_samples[i],
915
+ }
916
+ )
917
+ return dataset_names_dict
918
+
919
+
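A usage sketch of `convert_dataset_str_to_list` above, showing how the '+'-separated dataset specification described in `DataTrainingArguments` is expanded and how bare dataset ids get the distil-whisper org prepended (illustrative only; the dataset ids, configs and sample counts are made up):

specs = convert_dataset_str_to_list(
    dataset_names="librispeech_asr+mozilla-foundation/common_voice_13_0",
    dataset_config_names="all+en",
    splits="train.other.500+train",
    text_column_names="text+sentence",
    dataset_samples="100+200",
)
print(specs[0]["name"], specs[1]["samples"])
# distil-whisper/librispeech_asr 200.0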
920
+ def load_multiple_datasets(
921
+ dataset_names: Union[List, str],
922
+ dataset_config_names: Union[List, str],
923
+ splits: Optional[Union[List, str]] = None,
924
+ text_column_names: Optional[List] = None,
925
+ sampling_rate: Optional[int] = 16000,
926
+ stopping_strategy: Optional[str] = "first_exhausted",
927
+ dataset_samples: Optional[Union[List, np.array]] = None,
928
+ streaming: bool = True,
929
+ seed: int = None,
930
+ **kwargs,
931
+ ) -> IterableDataset:
932
+ dataset_names_dict = convert_dataset_str_to_list(
933
+ dataset_names, dataset_config_names, splits, text_column_names, dataset_samples
934
+ )
935
+
936
+ if dataset_samples is not None:
937
+ dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
938
+ probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
939
+ else:
940
+ probabilities = None
941
+
942
+ if len(dataset_names_dict) == 1:
943
+ dataset_dict = dataset_names_dict[0]
944
+ # we have a single dataset so just return it as is
945
+ return load_dataset(
946
+ dataset_dict["name"],
947
+ dataset_dict["config"],
948
+ split=dataset_dict["split"],
949
+ streaming=streaming,
950
+ **kwargs,
951
+ )
952
+
953
+ all_datasets = []
954
+ # iterate over the datasets we want to interleave
955
+ for dataset_dict in tqdm(dataset_names_dict, desc="Combining datasets..."):
956
+ dataset = load_dataset(
957
+ dataset_dict["name"],
958
+ dataset_dict["config"],
959
+ split=dataset_dict["split"],
960
+ streaming=streaming,
961
+ **kwargs,
962
+ )
963
+ # resample to specified sampling rate
964
+ dataset = dataset.cast_column("audio", datasets.features.Audio(sampling_rate))
965
+ dataset = dataset.remove_columns(
966
+ set(dataset.features.keys()) - {"audio", dataset_dict["text_column_name"], "whisper_transcript"}
967
+ )
968
+ all_datasets.append(dataset)
969
+
970
+ if streaming:
971
+ interleaved_dataset = interleave_datasets(
972
+ all_datasets,
973
+ stopping_strategy=stopping_strategy,
974
+ probabilities=probabilities,
975
+ seed=seed,
976
+ )
977
+ else:
978
+ interleaved_dataset = concatenate_datasets(all_datasets)
979
+
980
+ return interleaved_dataset
981
+
982
+
983
+ def get_layers_to_supervise(student_layers: int, teacher_layers: int) -> dict:
984
+ """Helper function to map the student layer i to the teacher layer j whose output we'd like them to emulate. Used
985
+ for MSE loss terms in distillation (hidden-states and activations). Student layers are paired with teacher layers
986
+ in equal increments, e.g. for a 12-layer model distilled to a 3-layer model, student layer 0 emulates teacher layer
987
+ 3 (such that it behaves like the first 4 teacher layers), student layer 1 emulates teacher layer 7, and student layer
988
+ 2 emulates teacher layer 11. This mapping is summarised by the dictionary: {0: 3, 1: 7, 2: 11}, which is precisely
989
+ the output of this function for the arguments (student_layers=3, teacher_layers=12)."""
990
+ layer_intervals = np.linspace(teacher_layers // student_layers - 1, teacher_layers - 1, student_layers, dtype=int)
991
+ layer_intervals[-1] = teacher_layers - 1
992
+ layer_map = {}
993
+
994
+ for student_layer, teacher_layer in enumerate(layer_intervals):
995
+ layer_map[student_layer] = teacher_layer
996
+
997
+ return layer_map
998
+
999
+
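A usage sketch of `get_layers_to_supervise` above (illustrative only): the 12-to-3 case matches the docstring, and a 32-to-2 case shows the mapping for a two-layer student distilled from a 32-layer teacher.

print(get_layers_to_supervise(student_layers=3, teacher_layers=12))  # maps student layer i -> teacher layer: {0: 3, 1: 7, 2: 11}
print(get_layers_to_supervise(student_layers=2, teacher_layers=32))  # {0: 15, 1: 31}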
1000
+ class FlaxWhisperFeatureExtractor(WhisperFeatureExtractor):
1001
+ def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
1002
+ """
1003
+ Compute the log-mel spectrogram of the provided audio using torch filters. Using the torch implementation
1004
+ computes stft filter banks approx 5x faster than its numpy counterpart, which is the native implementation
1005
+ in transformers, and matches to within 1e-5 abs tolerance.
1006
+ """
1007
+ waveform = torch.from_numpy(waveform).type(torch.float32)
1008
+
1009
+ window = torch.hann_window(self.n_fft)
1010
+ stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
1011
+ magnitudes = stft[..., :-1].abs() ** 2
1012
+
1013
+ mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)
1014
+ mel_spec = mel_filters.T @ magnitudes
1015
+
1016
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
1017
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
1018
+ log_spec = (log_spec + 4.0) / 4.0
1019
+ return log_spec.numpy()
1020
+
1021
+
1022
+ def main():
1023
+ # 1. Parse input arguments
1024
+ # See all possible arguments in src/transformers/training_args.py
1025
+ # or by passing the --help flag to this script.
1026
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
1027
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, FlaxSeq2SeqTrainingArguments))
1028
+
1029
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
1030
+ # If we pass only one argument to the script and it's the path to a json file,
1031
+ # let's parse it to get our arguments.
1032
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
1033
+ else:
1034
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
1035
+
1036
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
1037
+ # information sent is the one passed as arguments along with your JAX/Flax versions.
1038
+ send_example_telemetry("run_flax_speech_recognition_seq2seq", model_args, data_args, framework="flax")
1039
+
1040
+ # 2. Define remote logging - do this early so that we get the full traceback on our remote logs
1041
+ # Enable tensorboard only on the master node
1042
+ has_tensorboard = is_tensorboard_available()
1043
+ if has_tensorboard:
1044
+ if jax.process_index() == 0:
1045
+ try:
1046
+ from flax.metrics.tensorboard import SummaryWriter
1047
+
1048
+ summary_writer = SummaryWriter(log_dir=os.path.join(Path(training_args.output_dir), "runs"))
1049
+ except ImportError as ie:
1050
+ has_tensorboard = False
1051
+ logger.warning(
1052
+ "Unable to display metrics through TensorBoard because some package" f" are not installed: {ie}"
1053
+ )
1054
+ else:
1055
+ logger.warning(
1056
+ "Unable to display metrics through TensorBoard because the package is not"
1057
+ " installed: Please run `pip install tensorboard` to enable."
1058
+ )
1059
+
1060
+ # Enable wandb only on the master node
1061
+ has_wandb = is_wandb_available()
1062
+ if has_wandb:
1063
+ import wandb as wandb_logger
1064
+
1065
+ # Set up wandb run
1066
+ if jax.process_index() == 0:
1067
+ wandb_logger.init(
1068
+ project=data_args.wandb_project,
1069
+ name=data_args.wandb_name,
1070
+ job_type=data_args.wandb_job_type,
1071
+ dir=data_args.wandb_dir,
1072
+ save_code=data_args.save_code_to_wandb,
1073
+ )
1074
+ else:
1075
+ logger.warning("Wandb logging requires wandb to be installed. Run `pip install wandb` to enable.")
1076
+
1077
+ # 3. Setup local logging
1078
+ # Make one log on every process with the configuration for debugging.
1079
+ logging.basicConfig(
1080
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
1081
+ datefmt="%m/%d/%Y %H:%M:%S",
1082
+ handlers=[logging.StreamHandler(sys.stdout)],
1083
+ )
1084
+ # Set the verbosity of the Transformers logger to info.
1085
+ # We only want one process per machine to log things on the screen.
1086
+ logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
1087
+ if jax.process_index() == 0:
1088
+ datasets.utils.logging.set_verbosity_warning()
1089
+ transformers.utils.logging.set_verbosity_info()
1090
+ else:
1091
+ datasets.utils.logging.set_verbosity_error()
1092
+ transformers.utils.logging.set_verbosity_error()
1093
+
1094
+ logger.info("Training/evaluation parameters %s", training_args)
1095
+
1096
+ # Check the output dir is valid
1097
+ if (
1098
+ os.path.exists(training_args.output_dir)
1099
+ and os.listdir(training_args.output_dir)
1100
+ and training_args.do_train
1101
+ and not training_args.overwrite_output_dir
1102
+ ):
1103
+ raise ValueError(
1104
+ f"Output directory ({training_args.output_dir}) already exists and is not"
1105
+ " empty. Use `--overwrite_output_dir` to overcome."
1106
+ )
1107
+
1108
+ # 4. Handle the repository creation
1109
+ if training_args.push_to_hub:
1110
+ if training_args.hub_model_id is None:
1111
+ repo_name = get_full_repo_name(
1112
+ Path(training_args.output_dir).absolute().name,
1113
+ token=training_args.hub_token,
1114
+ )
1115
+ else:
1116
+ repo_name = training_args.hub_model_id
1117
+ create_repo(repo_name, exist_ok=True, token=training_args.hub_token)
1118
+ repo = Repository(
1119
+ training_args.output_dir,
1120
+ clone_from=repo_name,
1121
+ token=training_args.hub_token,
1122
+ )
1123
+
1124
+ if training_args.compilation_cache:
1125
+ cc.initialize_cache(os.path.join(model_args.cache_dir, "jax_cache"))
1126
+
1127
+ # 5. Load dataset
1128
+ raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1129
+
1130
+ # set seed for determinism
1131
+ set_seed(training_args.seed)
1132
+
1133
+ if training_args.do_train:
1134
+ raw_datasets["train"] = load_multiple_datasets(
1135
+ data_args.train_dataset_name,
1136
+ data_args.train_dataset_config_name,
1137
+ splits=data_args.train_split_name,
1138
+ streaming=data_args.streaming,
1139
+ dataset_samples=data_args.train_dataset_samples,
1140
+ seed=training_args.seed,
1141
+ cache_dir=data_args.dataset_cache_dir,
1142
+ token=True if model_args.use_auth_token else None,
1143
+ )
1144
+
1145
+ if training_args.do_eval:
1146
+ dataset_names_dict = convert_dataset_str_to_list(
1147
+ data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
1148
+ (
1149
+ data_args.eval_dataset_config_name
1150
+ if data_args.eval_dataset_config_name
1151
+ else data_args.train_dataset_config_name
1152
+ ),
1153
+ splits=data_args.eval_split_name,
1154
+ text_column_names=data_args.eval_text_column_name,
1155
+ )
1156
+ all_eval_splits = []
1157
+ if len(dataset_names_dict) == 1:
1158
+ # load a single eval set
1159
+ dataset_dict = dataset_names_dict[0]
1160
+ all_eval_splits.append("eval")
1161
+ raw_datasets["eval"] = load_dataset(
1162
+ dataset_dict["name"],
1163
+ dataset_dict["config"],
1164
+ split=dataset_dict["split"],
1165
+ cache_dir=data_args.dataset_cache_dir,
1166
+ token=True if model_args.use_auth_token else None,
1167
+ streaming=data_args.streaming,
1168
+ )
1169
+ else:
1170
+ # load multiple eval sets
1171
+ for dataset_dict in dataset_names_dict:
1172
+ if dataset_dict["name"] == "esb/diagnostic-dataset":
1173
+ # for the ESB diagnostic dataset, the dataset name is effectively the config
1174
+ pretty_name = f"{dataset_dict['config']}-diagnostic/{dataset_dict['split']}"
1175
+ else:
1176
+ pretty_name = f"{dataset_dict['name'].split('/')[-1]}/{dataset_dict['split'].replace('.', '-')}"
1177
+ all_eval_splits.append(pretty_name)
1178
+ raw_datasets[pretty_name] = load_dataset(
1179
+ dataset_dict["name"],
1180
+ dataset_dict["config"],
1181
+ split=dataset_dict["split"],
1182
+ cache_dir=data_args.dataset_cache_dir,
1183
+ token=True if model_args.use_auth_token else None,
1184
+ streaming=data_args.streaming,
1185
+ )
1186
+ features = raw_datasets[pretty_name].features.keys()
1187
+ if "text" not in features:
1188
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].rename_column(
1189
+ dataset_dict["text_column_name"], "text"
1190
+ )
1191
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].remove_columns(
1192
+ set(raw_datasets[pretty_name].features.keys()) - {"audio", "text"}
1193
+ )
1194
+
1195
+ if not training_args.do_train and not training_args.do_eval:
1196
+ raise ValueError(
1197
+ "Cannot not train and not do evaluation. At least one of training or evaluation has to be performed."
1198
+ )
1199
+
1200
+ raw_datasets_train_features = list(raw_datasets["train"].features.keys())
1201
+
1202
+ if data_args.audio_column_name not in raw_datasets_train_features:
1203
+ raise ValueError(
1204
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset"
1205
+ f" '{data_args.dataset_name}'. Make sure to set `--audio_column_name` to"
1206
+ " the correct audio column - one of"
1207
+ f" {', '.join(raw_datasets_train_features)}."
1208
+ )
1209
+
1210
+ if data_args.train_text_column_name not in raw_datasets_train_features:
1211
+ raise ValueError(
1212
+ f"--train_text_column_name {data_args.train_text_column_name} not found in dataset"
1213
+ f" '{data_args.dataset_name}'. Make sure to set `--train_text_column_name` to the"
1214
+ " correct text column - one of"
1215
+ f" {', '.join(raw_datasets_train_features)}."
1216
+ )
1217
+
1218
+ # 6. Load pretrained model, tokenizer, and feature extractor
1219
+ config = WhisperConfig.from_pretrained(
1220
+ (model_args.config_name if model_args.config_name else model_args.model_name_or_path),
1221
+ cache_dir=model_args.cache_dir,
1222
+ revision=model_args.model_revision,
1223
+ token=True if model_args.use_auth_token else None,
1224
+ )
1225
+ feature_extractor = FlaxWhisperFeatureExtractor.from_pretrained(
1226
+ (model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path),
1227
+ cache_dir=model_args.cache_dir,
1228
+ revision=model_args.model_revision,
1229
+ token=True if model_args.use_auth_token else None,
1230
+ )
1231
+ tokenizer = WhisperTokenizerFast.from_pretrained(
1232
+ (model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path),
1233
+ cache_dir=model_args.cache_dir,
1234
+ use_fast=model_args.use_fast_tokenizer,
1235
+ revision=model_args.model_revision,
1236
+ token=True if model_args.use_auth_token else None,
1237
+ )
1238
+
1239
+ # override timestamp tokens until tokenizer issues are fixed in transformers
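+ # this registers the 1501 timestamp tokens <|0.00|>, <|0.02|>, ..., <|30.00|> (0.02 s increments over
+ # a 30 s window) so that timestamps in the pseudo-labels map to single token ids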
1240
+ timestamps = [AddedToken("<|%.2f|>" % (i * 0.02), lstrip=False, rstrip=False) for i in range(1500 + 1)]
1241
+ tokenizer.add_tokens(timestamps)
1242
+
1243
+ config.update(
1244
+ {
1245
+ "activation_dropout": model_args.activation_dropout,
1246
+ "attention_dropout": model_args.attention_dropout,
1247
+ "dropout": model_args.dropout,
1248
+ }
1249
+ )
1250
+
1251
+ if training_args.precision == "full_mixed":
1252
+ # forward pass, backward pass and optimiser states in bf16
1253
+ dtype = jnp.bfloat16
1254
+ to_dtype = to_bf16
1255
+ elif training_args.precision == "half_mixed" or model_args.dtype == "bfloat16":
1256
+ # forward pass in bf16, backward pass and optimiser states in fp32
1257
+ dtype = jnp.bfloat16
1258
+ to_dtype = to_fp32
1259
+ else:
1260
+ if training_args.precision != "full":
1261
+ raise ValueError(
1262
+ f"`precision` should be one of: `full`, `half_mixed` or `full_mixed`, got {training_args.precision}"
1263
+ )
1264
+ # forward pass, backward pass and optimiser states in fp32
1265
+ dtype = jnp.float32
1266
+ to_dtype = to_fp32
1267
+
1268
+ student_model, student_params = FlaxWhisperForConditionalGeneration.from_pretrained(
1269
+ model_args.model_name_or_path,
1270
+ config=config,
1271
+ dtype=dtype,
1272
+ cache_dir=model_args.cache_dir,
1273
+ revision=model_args.model_revision,
1274
+ subfolder=model_args.subfolder,
1275
+ token=True if model_args.use_auth_token else None,
1276
+ _do_init=False,
1277
+ use_scan=model_args.load_with_scan_weights,
1278
+ )
1279
+
1280
+ teacher_model, teacher_params = FlaxWhisperForConditionalGeneration.from_pretrained(
1281
+ model_args.teacher_model_name_or_path,
1282
+ # config=config,
1283
+ dtype=dtype,
1284
+ cache_dir=model_args.cache_dir,
1285
+ # revision=model_args.model_revision,
1286
+ token=True if model_args.use_auth_token else None,
1287
+ _do_init=False,
1288
+ )
1289
+
1290
+ if student_model.config.decoder_start_token_id is None or teacher_model.config.decoder_start_token_id is None:
1291
+ raise ValueError(
1292
+ f"Make sure that `config.decoder_start_token_id` is correctly defined for both the "
1293
+ f"student and teacher model. Got {student_model.config.decoder_start_token_id} for the "
1294
+ f"student and {teacher_model.config.decoder_start_token_id} for the teacher."
1295
+ )
1296
+
1297
+ # enable scan / gradient checkpointing if necessary
1298
+ if training_args.use_scan:
1299
+ student_model.enable_scan() # to enable scan in the nn.Module
1300
+ student_params = student_model.convert_unroll_to_scan(student_params) # to convert the unrolled params to scan
1301
+
1302
+ teacher_model.enable_scan() # faster compile time (even though we don't train the teacher)
1303
+ teacher_params = teacher_model.convert_unroll_to_scan(teacher_params)
1304
+
1305
+ if training_args.gradient_checkpointing:
1306
+ student_model.enable_gradient_checkpointing() # to enable checkpointing in the nn.Module, there is no change to the params structure
1307
+ teacher_model.enable_gradient_checkpointing()
1308
+
1309
+ if hasattr(teacher_model.generation_config, "is_multilingual") and teacher_model.generation_config.is_multilingual:
1310
+ # We need to set the language and task ids for previously multilingual checkpoints - for now we hardcode this to Norwegian
1311
+ tokenizer.set_prefix_tokens(language="Norwegian", task="transcribe", predict_timestamps=False)
1312
+ student_model.generation_config.update(
1313
+ **{
1314
+ "language": "<|no|>",
1315
+ "task": "transcribe",
1316
+ }
1317
+ )
1318
+
1319
+ # 7. Resample speech dataset: `datasets` takes care of automatically loading and resampling the audio,
1320
+ # so we just need to set the correct target sampling rate.
1321
+ raw_datasets = raw_datasets.cast_column(
1322
+ data_args.audio_column_name,
1323
+ datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
1324
+ )
1325
+
1326
+ # 8. Preprocessing the datasets.
1327
+ # We need to read the audio files as arrays and tokenize the targets.
1328
+ max_input_length = int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate)
1329
+ min_input_length = int(data_args.min_duration_in_seconds * feature_extractor.sampling_rate)
1330
+ max_label_length = (
1331
+ data_args.max_label_length if data_args.max_label_length is not None else student_model.config.max_length
1332
+ )
1333
+ audio_column_name = data_args.audio_column_name
1334
+ num_workers = data_args.preprocessing_num_workers
1335
+ dataloader_num_workers = training_args.dataloader_num_workers
1336
+ dataloader_prefetch_size = data_args.prefetch_size
1337
+ train_text_column_name = data_args.train_text_column_name
1338
+ eval_text_column_name = "text"
1339
+ model_input_name = feature_extractor.model_input_names[0]
1340
+ normalizer = BasicTextNormalizer(tokenizer.english_spelling_normalizer)
1341
+ wer_threshold = data_args.wer_threshold
1342
+ round_timestamps = data_args.round_timestamps
1343
+
1344
+ if training_args.do_train and data_args.max_train_samples is not None:
1345
+ raw_datasets["train"] = (
1346
+ raw_datasets["train"].take(data_args.max_train_samples)
1347
+ if data_args.streaming
1348
+ else raw_datasets["train"].select(range(data_args.max_train_samples))
1349
+ )
1350
+
1351
+ if training_args.do_eval and data_args.max_eval_samples is not None:
1352
+ for eval_split in all_eval_splits:
1353
+ raw_datasets[eval_split] = (
1354
+ raw_datasets[eval_split].take(data_args.max_eval_samples)
1355
+ if data_args.streaming
1356
+ else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
1357
+ )
1358
+
1359
+ # 10.3: filter training data based on WER threshold -> this is KEY to good distillation performance
1360
+ def is_wer_in_range(ground_truth, whisper_transcript):
1361
+ norm_ground_truth = normalizer(ground_truth)
1362
+ if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
1363
+ # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
1364
+ return False
1365
+ elif len(norm_ground_truth) == 0 and len(normalizer(whisper_transcript)) == 0:
1366
+ return True
1367
+ elif len(norm_ground_truth.strip()) > 0 and whisper_transcript is not None and len(normalizer(whisper_transcript).strip()) > 0:
1368
+ norm_whisper_transcript = normalizer(whisper_transcript)
1369
+ wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
1370
+ return wer < wer_threshold
1371
+ else:
1372
+ # filter automatically since we can't compute the WER
1373
+ return False
1374
+
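+ # Worked example of the filter above: for ground truth "hello there" and pseudo-label "hello their",
+ # the normalised WER is 1 substitution / 2 reference words = 50%, so the example is dropped for any
+ # wer_threshold below 50 (e.g. the 10% threshold used in the training scripts in this commit).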
1375
+
1376
+ # the WER metric is needed by `is_wer_in_range`, so load it before the filter is applied
+ metric = evaluate.load("wer")
+ filter_by_wer_threshold = partial(
1377
+ raw_datasets["train"].filter,
1378
+ function=is_wer_in_range,
1379
+ input_columns=[eval_text_column_name, train_text_column_name],
1380
+ )
1381
+
1382
+ if wer_threshold is not None:
1383
+ raw_datasets["train"] = (
1384
+ filter_by_wer_threshold(num_proc=num_workers, desc="filtering train dataset by wer")
1385
+ if not data_args.streaming
1386
+ else filter_by_wer_threshold()
1387
+ )
1388
+
1389
+ def has_timestamp_tokens(input_str):
1390
+ """
1391
+ Identify whether the input string contains timestamp tokens, of the form <|0.00|>, by searching for
1392
+ pairs of left and right-angle brackets.
1393
+ """
1394
+ return bool(re.search(r"<[^>]*>", input_str))
1395
+
1396
+ def round_timestamp_tokens(input_str: str, ndigits: int = 1):
1397
+ timestamps = re.findall(r"<[^>]*>", input_str, re.DOTALL)
1398
+ for token in timestamps:
1399
+ # extract time digits from timestamp token, e.g. <|6.24|> to 6.24
1400
+ time_digit = token[2:-2]
1401
+ # round to specified number of digits, e.g. 6.24 to 6.2
1402
+ time_digit = round(float(time_digit), ndigits=ndigits)
1403
+ # replace in original string with the same precision, e.g. <|6.24|> to <|6.20|>
1404
+ input_str = input_str.replace(token, "<|{:.2f}|>".format(time_digit))
1405
+ return input_str
1406
+
1407
+ def prepare_train_dataset(batch):
1408
+ # process audio input
1409
+ sample = batch[audio_column_name]
1410
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1411
+ batch[model_input_name] = inputs.get(model_input_name)[0]
1412
+ batch["input_length"] = len(sample["array"])
1413
+
1414
+ # process text targets
1415
+ input_str = batch[train_text_column_name]
1416
+
1417
+ # prompt & timestamp processing: for now, we only do one or the other
1418
+ if input_str.startswith("<|startoftranscript|>") or input_str.startswith("<|startofprev|>"):
1419
+ # prompted target text already has special ids added, so don't add them here
1420
+ batch["labels"] = tokenizer(input_str, add_special_tokens=False).input_ids
1421
+ return batch
1422
+
1423
+ has_timestamps = has_timestamp_tokens(input_str)
1424
+
1425
+ if has_timestamps:
1426
+ predict_timestamps = bool(np.random.binomial(1, data_args.timestamp_probability))
1427
+ if not predict_timestamps:
1428
+ # filter timestamp token ids if not part of the prediction task
1429
+ input_str = tokenizer._filter_timestamp_ids(input_str)
1430
+ elif round_timestamps:
1431
+ input_str = round_timestamp_tokens(input_str)
1432
+ else:
1433
+ predict_timestamps = False
1434
+
1435
+ tokenizer.set_prefix_tokens(language="Norwegian", task="transcribe", predict_timestamps=predict_timestamps)
1436
+ input_ids = tokenizer(input_str).input_ids
1437
+ batch["labels"] = input_ids
1438
+ return batch
1439
+
1440
+ def prepare_eval_dataset(batch):
1441
+ # process audio
1442
+ sample = batch[audio_column_name]
1443
+ inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
1444
+ # process audio length
1445
+ batch[model_input_name] = inputs.get(model_input_name)[0]
1446
+ batch["input_length"] = len(sample["array"])
1447
+
1448
+ # process targets
1449
+ input_str = batch[eval_text_column_name]
1450
+ batch["labels"] = tokenizer(input_str).input_ids
1451
+ return batch
1452
+
1453
+ vectorized_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1454
+ if training_args.do_train:
1455
+ map_fn_train = partial(
1456
+ raw_datasets["train"].map, function=prepare_train_dataset, remove_columns=raw_datasets_train_features
1457
+ )
1458
+ vectorized_datasets["train"] = (
1459
+ map_fn_train(num_proc=num_workers, desc="preprocess train dataset")
1460
+ if not data_args.streaming
1461
+ else map_fn_train()
1462
+ )
1463
+ if training_args.do_eval:
1464
+ for eval_split in all_eval_splits:
1465
+ raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
1466
+ map_fn_eval = partial(
1467
+ raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
1468
+ )
1469
+ vectorized_datasets[eval_split] = (
1470
+ map_fn_eval(num_proc=num_workers, desc="preprocess eval dataset")
1471
+ if not data_args.streaming
1472
+ else map_fn_eval()
1473
+ )
1474
+
1475
+ # filter training data with inputs longer than max_input_length
1476
+ def is_audio_in_length_range(length):
1477
+ return min_input_length < length < max_input_length
1478
+
1479
+ filter_by_audio_fn = partial(
1480
+ vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1481
+ )
1482
+ vectorized_datasets = (
1483
+ filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1484
+ if not data_args.streaming
1485
+ else filter_by_audio_fn()
1486
+ )
1487
+
1488
+ # filter training data with labels longer than max_label_length
1489
+ def is_labels_in_length_range(labels):
1490
+ return 0 < len(labels) < max_label_length
1491
+
1492
+ filter_by_labels_fn = partial(
1493
+ vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1494
+ )
1495
+ vectorized_datasets = (
1496
+ filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1497
+ if not data_args.streaming
1498
+ else filter_by_labels_fn()
1499
+ )
1500
+
1501
+ # for large datasets it is advised to run the preprocessing on a
1502
+ # single machine first with `args.preprocessing_only` since there will most likely
1503
+ # be a timeout when running the script in distributed mode.
1504
+ # In a second step `args.preprocessing_only` can then be set to `False` to load the
1505
+ # cached dataset
1506
+ if data_args.preprocessing_only:
1507
+ cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1508
+ logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1509
+ return
1510
+
1511
+ # 8. Evaluation conventions (the WER metric itself is loaded above, before the WER-threshold filter)
1513
+ # convention is that we space all punctuation *except* apostrophes
1514
+ all_punctuation = list(string.punctuation.replace("'", ""))
1515
+ return_timestamps = data_args.return_timestamps if data_args.timestamp_probability > 0 else False
1516
+
1517
+ def compute_metrics(preds, labels):
1518
+ # replace padded labels by the padding token
1519
+ for idx in range(len(labels)):
1520
+ labels[idx][labels[idx] == -100] = tokenizer.pad_token_id
1521
+
1522
+ pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
1523
+ # we do not want to group tokens when computing the metrics
1524
+ label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
1525
+
1526
+ # space punctuation for orthographic WER (c.f. ESB paper https://arxiv.org/abs/2210.13352)
1527
+ # space every punctuation mark in each prediction and reference string
+ spaced_pred_str = list(pred_str)
+ spaced_label_str = list(label_str)
+ for punctuation in all_punctuation:
+ spaced_pred_str = [s.replace(punctuation, f" {punctuation} ") for s in spaced_pred_str]
+ spaced_label_str = [s.replace(punctuation, f" {punctuation} ") for s in spaced_label_str]
1537
+ wer_ortho = 100 * metric.compute(predictions=spaced_pred_str, references=spaced_label_str)
1538
+
1539
+ norm_pred_str, norm_label_str = [], []
1540
+
1541
+ # Iterate through all predictions and labels
1542
+ for pred, label in zip(pred_str, label_str):
1543
+ # Normalize the prediction and label
1544
+ normalized_pred = normalizer(pred)
1545
+ normalized_label = normalizer(label)
1546
+
1547
+ # If either normalized string is empty after normalization, replace with "<|nospeech|>"
1548
+ if not normalized_pred.strip():
1549
+ normalized_pred = "<|nospeech|>"
1550
+ if not normalized_label.strip():
1551
+ normalized_label = "<|nospeech|>"
1552
+
1553
+ norm_pred_str.append(normalized_pred)
1554
+ norm_label_str.append(normalized_label)
1555
+
1556
+ # Replace empty strings with "<|nospeech|>" where necessary for consistency
1557
+ pred_str = [pred if len(pred.strip()) > 0 else "<|nospeech|>" for pred in pred_str]
1558
+ label_str = [label if len(label.strip()) > 0 else "<|nospeech|>" for label in label_str]
1559
+
1560
+ # Compute WER using all entries, including those replaced with "<|nospeech|>"
1561
+ wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
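+ # Example of the two metrics: prediction "hello, world" vs. reference "Hello world" gives an
+ # orthographic WER of 100% (the case mismatch on "hello" and the extra spaced "," count as errors
+ # against 2 reference words), while the normalised WER is 0% because the normaliser lower-cases
+ # and strips punctuation before comparison.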
1562
+ return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1563
+
1564
+
1565
+ # 9. Save feature extractor, tokenizer, config and generation config
1566
+ feature_extractor.save_pretrained(training_args.output_dir)
1567
+ tokenizer.save_pretrained(training_args.output_dir)
1568
+ config.save_pretrained(training_args.output_dir)
1569
+ student_model.generation_config.save_pretrained(
1570
+ training_args.output_dir
1571
+ ) # generation config stays bound to model to make it easy to jit
1572
+
1573
+ processor = WhisperProcessor.from_pretrained(training_args.output_dir)
1574
+
1575
+ data_collator = FlaxDataCollatorSpeechSeq2SeqWithPadding(
1576
+ processor=processor,
1577
+ decoder_start_token_id=student_model.config.decoder_start_token_id, # <|startoftranscript|>
1578
+ decoder_prev_token_id=tokenizer.all_special_ids[-3], # <|startofprev|>
1579
+ input_padding="longest",
1580
+ target_padding="max_length",
1581
+ max_target_length=max_label_length,
1582
+ )
1583
+
1584
+ # Initialize our training
1585
+ rng = jax.random.PRNGKey(training_args.seed)
1586
+ rng, dropout_rng = jax.random.split(rng)
1587
+
1588
+ # Store some constants
1589
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
1590
+ gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1591
+ per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1592
+ eval_batch_size = per_device_eval_batch_size * jax.device_count()
1593
+
1594
+ if not data_args.streaming and training_args.max_steps < 0:
1595
+ num_epochs = int(training_args.num_train_epochs)
1596
+ steps_per_epoch = len(vectorized_datasets["train"]) // train_batch_size
1597
+ total_train_steps = steps_per_epoch * num_epochs
1598
+ elif training_args.max_steps > 0:
1599
+ logger.info("max_steps is given, it will override any value given in num_train_epochs")
1600
+ total_train_steps = int(training_args.max_steps)
1601
+ # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1602
+ num_epochs = sys.maxsize
1603
+ steps_per_epoch = total_train_steps
1604
+ else:
1605
+ raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1606
+
1607
+ if training_args.eval_steps is None:
1608
+ logger.info(
1609
+ f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1610
+ )
1611
+ eval_steps = steps_per_epoch
1612
+ else:
1613
+ eval_steps = training_args.eval_steps
1614
+
1615
+ # Create learning rate schedule
1616
+ linear_decay_lr_schedule_fn = create_learning_rate_fn(
1617
+ total_train_steps * gradient_accumulation_steps,
1618
+ training_args.lr_scheduler_type,
1619
+ training_args.warmup_steps * gradient_accumulation_steps,
1620
+ training_args.learning_rate,
1621
+ )
1622
+
1623
+ # We use Optax's "masking" functionality to not apply weight decay
1624
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
1625
+ # boolean mask with the same structure as the parameters.
1626
+ # The mask is True for parameters that should be decayed.
1627
+ def decay_mask_fn(params):
1628
+ flat_params = traverse_util.flatten_dict(params)
1629
+ # find out all LayerNorm parameters
1630
+ layer_norm_candidates = [
1631
+ "layer_norm",
1632
+ "self_attn_layer_norm",
1633
+ "final_layer_norm",
1634
+ "encoder_attn_layer_norm",
1635
+ ]
1636
+ layer_norm_named_params = {
1637
+ layer[-2:]
1638
+ for layer_norm_name in layer_norm_candidates
1639
+ for layer in flat_params.keys()
1640
+ if layer_norm_name in "".join(layer).lower()
1641
+ }
1642
+ flat_mask = {path: path[-1] != "bias" and path[-2:] not in layer_norm_named_params for path in flat_params}
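+ # e.g. a path ending in ("q_proj", "kernel") maps to True (weight decay applied), while paths ending
+ # in "bias" or in ("self_attn_layer_norm", "scale") map to False (no weight decay)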
1643
+ return traverse_util.unflatten_dict(flat_mask)
1644
+
1645
+ # create adam optimizer
1646
+ adamw = optax.adamw(
1647
+ learning_rate=linear_decay_lr_schedule_fn,
1648
+ b1=training_args.adam_beta1,
1649
+ b2=training_args.adam_beta2,
1650
+ eps=training_args.adam_epsilon,
1651
+ weight_decay=training_args.weight_decay,
1652
+ mask=decay_mask_fn,
1653
+ )
1654
+
1655
+ if gradient_accumulation_steps > 1:
1656
+ # accumulate gradients and apply once every k steps
1657
+ adamw = optax.MultiSteps(adamw, every_k_schedule=gradient_accumulation_steps)
1658
+
1659
+ share_hidden_states = training_args.freeze_encoder and student_model.config.d_model == teacher_model.config.d_model
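+ # the student decoder can re-use the teacher's encoder outputs only when the encoder is frozen and
+ # the hidden sizes match; this assumes the student encoder is an unchanged copy of the teacher encoder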
1660
+ encoder_layer_mapping = get_layers_to_supervise(
1661
+ student_model.config.encoder_layers, teacher_model.config.encoder_layers
1662
+ )
1663
+ decoder_layer_mapping = get_layers_to_supervise(
1664
+ student_model.config.decoder_layers, teacher_model.config.decoder_layers
1665
+ )
1666
+
1667
+ # Setup train state
1668
+ student_state = TrainState.create(
1669
+ apply_fn=student_model.decode if share_hidden_states else student_model.__call__,
1670
+ params=student_params,
1671
+ tx=adamw,
1672
+ to_dtype=to_dtype,
1673
+ dropout_rng=dropout_rng,
1674
+ max_grad_norm=training_args.max_grad_norm,
1675
+ )
1676
+
1677
+ if training_args.resume_from_checkpoint is not None:
1678
+ if os.path.isfile(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")):
1679
+ logger.info(
1680
+ f"Checkpoint detected, resuming training at {training_args.resume_from_checkpoint}. To avoid "
1681
+ "this behavior, omit the resume_from_checkpoint argument."
1682
+ )
1683
+ with Path(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")).open("rb") as f:
1684
+ student_state = from_bytes(student_state, f.read())
1685
+ else:
1686
+ logger.warning(
1687
+ f"Checkpoint {training_args.resume_from_checkpoint} not detected, training from scratch. Ensure "
1688
+ f"you pass the path to a folder with a valid checkpoint for your model."
1689
+ )
1690
+
1691
+ def cross_entropy_loss(logits, labels):
1692
+ vocab_size = logits.shape[-1]
1693
+ # optax onehot always returns a float32 device array, need to downcast if performing mixed precision training
1694
+ onehot_targets = to_dtype(onehot(labels, vocab_size))
1695
+ loss = optax.softmax_cross_entropy(logits, onehot_targets)
1696
+ # ignore padded tokens in the loss, i.e. positions where the label is set to -100
1697
+ padding = labels >= 0
1698
+ loss = loss * padding
1699
+ loss = loss.sum()
1700
+ num_labels = padding.sum()
1701
+ return loss, num_labels
1702
+
1703
+ # temperature smoothed kl-divergence
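+ # (forward KL between the teacher and student token distributions, summed over non-padded positions;
+ # the caller rescales the result by temperature**2 so that the soft-target gradients keep the same
+ # scale as the cross-entropy term, as in Hinton et al. (2015))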
1704
+ def kl_divergence(target_distribution, log_predicted_distribution, labels, eps=1e-20):
1705
+ divergence = -target_distribution * (log_predicted_distribution - jnp.log(target_distribution + eps))
1706
+ # ignore padded tokens in the divergence, i.e. positions where the label is set to -100
1707
+ padding_mask = labels >= 0
1708
+ padding_mask = jnp.expand_dims(padding_mask, axis=-1)
1709
+ divergence = (divergence * padding_mask).sum()
1710
+ return to_dtype(divergence) # respect the dtype of the backprop
1711
+
1712
+ def mean_square_error_loss(student_outputs, teacher_outputs):
1713
+ mse = dtype(0.0)
1714
+
1715
+ # tie encoder embeddings
1716
+ mse += jnp.mean(
1717
+ jnp.square(teacher_outputs.encoder_hidden_states[0] - student_outputs.encoder_hidden_states[0])
1718
+ )
1719
+
1720
+ for student_layer_id, teacher_layer_id in encoder_layer_mapping.items():
1721
+ # offset the hidden-state layer ids by 1 to account for the extra embedding hidden-state
1722
+ student_hidden_state = student_outputs.encoder_hidden_states[student_layer_id + 1]
1723
+ teacher_hidden_state = teacher_outputs.encoder_hidden_states[teacher_layer_id + 1]
1724
+ mse += jnp.mean(jnp.square(teacher_hidden_state - student_hidden_state))
1725
+
1726
+ # student_attention = student_outputs.encoder_attentions[student_layer_id]
1727
+ # teacher_attention = teacher_outputs.encoder_attentions[teacher_layer_id]
1728
+ # mse += jnp.mean(jnp.square(student_attention - teacher_attention))
1729
+
1730
+ # tie decoder embeddings
1731
+ mse += jnp.mean(
1732
+ jnp.square(teacher_outputs.decoder_hidden_states[0] - student_outputs.decoder_hidden_states[0])
1733
+ )
1734
+
1735
+ for student_layer_id, teacher_layer_id in decoder_layer_mapping.items():
1736
+ # offset the hidden-state layer ids by 1 to account for the extra embedding hidden-state
1737
+ student_hidden_state = student_outputs.decoder_hidden_states[student_layer_id + 1]
1738
+ teacher_hidden_state = teacher_outputs.decoder_hidden_states[teacher_layer_id + 1]
1739
+ mse += jnp.mean(jnp.square(teacher_hidden_state - student_hidden_state))
1740
+
1741
+ # student_attention = student_outputs.decoder_attentions[student_layer_id]
1742
+ # teacher_attention = teacher_outputs.decoder_attentions[teacher_layer_id]
1743
+ # mse += jnp.mean(jnp.square(student_attention - teacher_attention))
1744
+
1745
+ # student_cross_attention = student_outputs.cross_attentions[student_layer_id]
1746
+ # teacher_cross_attention = teacher_outputs.cross_attentions[teacher_layer_id]
1747
+ # mse += jnp.mean(jnp.square(student_cross_attention - teacher_cross_attention))
1748
+
1749
+ return to_dtype(mse) # respect the dtype of the backprop
1750
+
1751
+ # Define gradient update step fn
1752
+ def train_step(
1753
+ student_state,
1754
+ teacher_params,
1755
+ batch,
1756
+ freeze_encoder,
1757
+ share_hidden_states,
1758
+ temperature=2.0,
1759
+ ):
1760
+ dropout_rng, new_dropout_rng = jax.random.split(student_state.dropout_rng)
1761
+
1762
+ def compute_loss(student_params):
1763
+ labels = batch.pop("labels")
1764
+ output_hidden_states = not share_hidden_states and training_args.mse_weight > 0.0
1765
+
1766
+ teacher_outputs = teacher_model(
1767
+ **batch,
1768
+ params=teacher_params,
1769
+ freeze_encoder=True,
1770
+ output_hidden_states=output_hidden_states,
1771
+ train=False,
1772
+ )
1773
+
1774
+ if share_hidden_states:
1775
+ # if the student and teacher share the same frozen encoder then we don't have to recompute the
1776
+ # encoder hidden-states for the student model, we can just re-use from the teacher
1777
+ encoder_hidden_states = jax.lax.stop_gradient(teacher_outputs.encoder_last_hidden_state)
1778
+ encoder_outputs = FlaxBaseModelOutput(last_hidden_state=encoder_hidden_states)
1779
+
1780
+ student_outputs = student_state.apply_fn(
1781
+ decoder_input_ids=batch["decoder_input_ids"],
1782
+ encoder_outputs=encoder_outputs,
1783
+ params=student_params,
1784
+ dropout_rng=dropout_rng,
1785
+ train=True,
1786
+ )
1787
+ else:
1788
+ # do the full forward pass for the student model (encoder + decoder)
1789
+ student_outputs = student_state.apply_fn(
1790
+ **batch,
1791
+ params=student_params,
1792
+ dropout_rng=dropout_rng,
1793
+ freeze_encoder=freeze_encoder,
1794
+ output_hidden_states=output_hidden_states,
1795
+ train=True,
1796
+ )
1797
+
1798
+ # CE (data) loss
1799
+ ce_loss, num_labels = cross_entropy_loss(student_outputs.logits, labels)
1800
+
1801
+ # rescale by temperature to ensure gradients scale correctly
1802
+ teacher_distribution = jax.nn.softmax(teacher_outputs.logits / temperature, axis=-1)
1803
+ # ensure no information flow backwards through teacher
1804
+ teacher_distribution = jax.lax.stop_gradient(teacher_distribution)
1805
+ # log softmax of student predictions for numerical stability
1806
+ student_distribution = jax.nn.log_softmax(student_outputs.logits / temperature, axis=-1)
1807
+ # KL-divergence loss (scaled by temperature)
1808
+ kl_loss = kl_divergence(teacher_distribution, student_distribution, labels) * temperature**2
1809
+
1810
+ # MSE loss between enc-dec hidden-states and attentions
1811
+ mse_loss = (
1812
+ mean_square_error_loss(student_outputs, teacher_outputs)
1813
+ if output_hidden_states
1814
+ else jnp.zeros_like(kl_loss)
1815
+ )
1816
+
1817
+ # use DistilBart formulation - only tune the MSE weight and take remaining HPs from DistilBERT
1818
+ ce_weight = 0.8 if training_args.kl_weight > 0 else 1.0
1819
+ loss = ce_weight * ce_loss + training_args.kl_weight * kl_loss + training_args.mse_weight * mse_loss
1820
+
1821
+ return loss, (
1822
+ ce_loss,
1823
+ kl_loss,
1824
+ mse_loss,
1825
+ num_labels,
1826
+ )
1827
+
1828
+ grad_fn = jax.value_and_grad(compute_loss, has_aux=True)
1829
+ (loss, (ce_loss, kl_loss, mse_loss, num_labels)), grad = grad_fn(to_dtype(student_state.params))
1830
+
1831
+ # true loss = total loss / total samples
1832
+ loss = jax.lax.psum(loss, "batch")
1833
+ num_labels = jax.lax.psum(num_labels, "batch")
1834
+ loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
1835
+
1836
+ # true grad = total grad / total samples
1837
+ grad = jax.lax.psum(grad, "batch")
1838
+ grad = jax.tree_util.tree_map(lambda x: x / num_labels, grad)
1839
+ new_state = student_state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng, to_dtype=to_dtype)
1840
+
1841
+ # CE/KL/MSE losses for logging
1842
+ ce_loss = jax.lax.psum(ce_loss, "batch")
1843
+ ce_loss = jax.tree_util.tree_map(lambda x: x / num_labels, ce_loss)
1844
+
1845
+ kl_loss = jax.lax.psum(kl_loss, "batch")
1846
+ kl_loss = jax.tree_util.tree_map(lambda x: x / num_labels, kl_loss)
1847
+
1848
+ mse_loss = jax.lax.psum(mse_loss, "batch")
1849
+ mse_loss = jax.tree_util.tree_map(lambda x: x / num_labels, mse_loss)
1850
+
1851
+ metrics = {
1852
+ "loss": loss,
1853
+ "learning_rate": linear_decay_lr_schedule_fn(student_state.step),
1854
+ "ce_loss": ce_loss,
1855
+ "kl_loss": kl_loss,
1856
+ "mse_loss": mse_loss,
1857
+ }
1858
+ return new_state, metrics
1859
+
1860
+ # Define eval fn
1861
+ def eval_step(student_params, teacher_params, batch):
1862
+ labels = batch.pop("labels")
1863
+ output_hidden_states = not share_hidden_states and training_args.mse_weight > 0
1864
+
1865
+ student_outputs = student_model(
1866
+ **batch,
1867
+ params=student_params,
1868
+ output_hidden_states=output_hidden_states,
1869
+ train=False,
1870
+ )
1871
+ student_distribution = jax.nn.log_softmax(student_outputs.logits, axis=-1)
1872
+ ce_loss, num_labels = cross_entropy_loss(student_outputs.logits, labels)
1873
+
1874
+ teacher_outputs = teacher_model(
1875
+ **batch,
1876
+ params=teacher_params,
1877
+ output_hidden_states=output_hidden_states,
1878
+ train=False,
1879
+ )
1880
+ teacher_distribution = jax.nn.softmax(teacher_outputs.logits, axis=-1)
1881
+ # temperature is always 1 for eval
1882
+ kl_loss = kl_divergence(teacher_distribution, student_distribution, labels)
1883
+
1884
+ mse_loss = (
1885
+ mean_square_error_loss(student_outputs, teacher_outputs)
1886
+ if output_hidden_states
1887
+ else jnp.zeros_like(kl_loss)
1888
+ )
1889
+
1890
+ ce_weight = 0.8 if training_args.kl_weight > 0 else 1.0
1891
+ loss = ce_weight * ce_loss + training_args.kl_weight * kl_loss + training_args.mse_weight * mse_loss
1892
+ # true loss = total loss / total samples
1893
+ loss = jax.lax.psum(loss, "batch")
1894
+ num_labels = jax.lax.psum(num_labels, "batch")
1895
+ loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)
1896
+
1897
+ # CE/KL/MSE losses for logging
1898
+ ce_loss = jax.lax.psum(ce_loss, "batch")
1899
+ ce_loss = jax.tree_util.tree_map(lambda x: x / num_labels, ce_loss)
1900
+
1901
+ kl_loss = jax.lax.psum(kl_loss, "batch")
1902
+ kl_loss = jax.tree_util.tree_map(lambda x: x / num_labels, kl_loss)
1903
+
1904
+ mse_loss = jax.lax.psum(mse_loss, "batch")
1905
+ mse_loss = jax.tree_util.tree_map(lambda x: x / num_labels, mse_loss)
1906
+
1907
+ metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss, "mse_loss": mse_loss}
1908
+ return metrics
1909
+
1910
+ # Define generation function
1911
+ num_beams = (
1912
+ training_args.generation_num_beams
1913
+ if training_args.generation_num_beams is not None
1914
+ else student_model.config.num_beams
1915
+ )
1916
+
1917
+ # forcing the language and task tokens helps the model in its generations
1918
+ gen_kwargs = {
1919
+ "max_length": max_label_length,
1920
+ "num_beams": num_beams,
1921
+ "language": "<|en|>",
1922
+ "task": "transcribe",
1923
+ "return_timestamps": return_timestamps,
1924
+ }
1925
+
1926
+ def generate_step(student_params, batch):
1927
+ output_ids = student_model.generate(
1928
+ batch[model_input_name],
1929
+ attention_mask=batch.get("attention_mask"),
1930
+ params=student_params,
1931
+ **gen_kwargs,
1932
+ )
1933
+ return output_ids.sequences
1934
+
1935
+ # Replicate the train state on each device
1936
+ student_state = student_state.replicate()
1937
+
1938
+ # Replicate the teacher params on each device
1939
+ teacher_params = jax_utils.replicate(teacher_params)
1940
+
1941
+ # Create parallel version of the train and eval step
1942
+ p_train_step = jax.pmap(
1943
+ train_step,
1944
+ "batch",
1945
+ in_axes=(0, 0, 0, None, None, None),
1946
+ donate_argnums=(0,),
1947
+ static_broadcasted_argnums=(
1948
+ 3,
1949
+ 4,
1950
+ ),
1951
+ )
1952
+ p_eval_step = jax.pmap(eval_step, "batch")
1953
+ p_generate_step = jax.pmap(generate_step, "batch")
1954
+
1955
+ logger.info("***** Running training *****")
1956
+ logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
1957
+ logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1958
+ logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1959
+ logger.info(
1960
+ f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1961
+ )
1962
+ logger.info(f" Total optimization steps = {total_train_steps}")
1963
+
1964
+ # ======================== Training ================================
1965
+ train_time = 0
1966
+ train_start = time.time()
1967
+ train_metrics = []
1968
+ batches_to_skip = jax.device_get(unreplicate(student_state.step))
1969
+ cur_step = int(batches_to_skip) # will be zero if starting from scratch
1970
+ epochs_trained = batches_to_skip // steps_per_epoch
1971
+ steps_trained_progress_bar = tqdm(range(total_train_steps), desc="Train steps ... ", position=0)
1972
+ steps_trained_progress_bar.update(batches_to_skip)
1973
+ continue_training = True
1974
+ minibatch_steps = 0
1975
+
1976
+ if batches_to_skip > 0:
1977
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1978
+ logger.info(f" Continuing training from epoch {epochs_trained}")
1979
+ logger.info(f" Continuing training from global step {batches_to_skip}")
1980
+
1981
+ # Generate a training data loader by shuffling sampling indices from the train dataset
1982
+ train_loader = get_data_loader(
1983
+ training_args.seed,
1984
+ vectorized_datasets["train"],
1985
+ batch_size=train_batch_size,
1986
+ data_collator=data_collator,
1987
+ dataloader_num_workers=dataloader_num_workers,
1988
+ skip_batches=batches_to_skip,
1989
+ prefetch_size=dataloader_prefetch_size,
1990
+ )
1991
+
1992
+ for epoch in range(epochs_trained, num_epochs):
1993
+ if hasattr(train_loader, "dataset") and isinstance(train_loader.dataset, IterableDataset):
1994
+ train_loader.dataset.set_epoch(epoch)
1995
+
1996
+ for batch in train_loader:
1997
+ minibatch_steps += 1
1998
+ update_step = minibatch_steps == gradient_accumulation_steps
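+ # gradient accumulation is handled inside the optimizer (optax.MultiSteps): p_train_step runs on every
+ # mini-batch, while the parameters, cur_step and the progress bar only advance once every
+ # `gradient_accumulation_steps` mini-batches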
1999
+
2000
+ if update_step:
2001
+ steps_trained_progress_bar.update(1)
2002
+ cur_step += 1
2003
+ minibatch_steps = 0
2004
+
2005
+ batch = shard(batch.data)
2006
+ student_state, train_metric = p_train_step(
2007
+ student_state,
2008
+ teacher_params,
2009
+ batch,
2010
+ training_args.freeze_encoder,
2011
+ share_hidden_states,
2012
+ training_args.temperature,
2013
+ )
2014
+
2015
+ if cur_step % training_args.logging_steps == 0 and update_step:
2016
+ train_metrics.append(train_metric)
2017
+ train_metric_to_write = unreplicate(train_metric)
2018
+ steps_trained_progress_bar.write(
2019
+ f"Step... ({cur_step} / {total_train_steps} | Loss:"
2020
+ f" {train_metric_to_write['loss']}, Learning Rate:"
2021
+ f" {train_metric_to_write['learning_rate']})"
2022
+ )
2023
+ if has_wandb and jax.process_index() == 0:
2024
+ write_wandb_metric(
2025
+ wandb_logger,
2026
+ train_metric_to_write,
2027
+ train_time + time.time() - train_start,
2028
+ cur_step,
2029
+ epoch,
2030
+ prefix="train",
2031
+ )
2032
+
2033
+ # save checkpoint and weights after each save_steps and at the end of training
2034
+ if (cur_step % training_args.save_steps == 0 and update_step) or cur_step == total_train_steps:
2035
+ if jax.process_index() == 0:
2036
+ save_hf_weights(
2037
+ student_state,
2038
+ student_model,
2039
+ processor,
2040
+ training_args.output_dir,
2041
+ cur_step,
2042
+ total_train_steps,
2043
+ use_scan=training_args.use_scan,
2044
+ )
2045
+ if training_args.save_train_state:
2046
+ student_state.save_state(
2047
+ training_args.output_dir, save_total_limit=training_args.save_total_limit
2048
+ )
2049
+ if training_args.push_to_hub:
2050
+ repo.push_to_hub(
2051
+ commit_message=f"Saving train state of step {cur_step}",
2052
+ blocking=False,
2053
+ )
2054
+
2055
+ if training_args.do_eval and (
2056
+ (cur_step % eval_steps == 0 and update_step) or cur_step == total_train_steps
2057
+ ):
2058
+ train_time += time.time() - train_start
2059
+ # ======================== Evaluating ==============================
2060
+ for eval_split in all_eval_splits:
2061
+ eval_metrics = []
2062
+ eval_preds = []
2063
+ eval_labels = []
2064
+ eval_start = time.time()
2065
+
2066
+ eval_loader = get_data_loader(
2067
+ training_args.seed,
2068
+ vectorized_datasets[eval_split],
2069
+ batch_size=eval_batch_size,
2070
+ data_collator=data_collator,
2071
+ shuffle=False,
2072
+ drop_last=False,
2073
+ dataloader_num_workers=dataloader_num_workers,
2074
+ )
2075
+ for batch in tqdm(eval_loader, desc=f"Evaluating {eval_split}...", position=2):
2076
+ # Model forward
2077
+ labels = batch["labels"]
2078
+
2079
+ metrics = pad_shard_unpad(
2080
+ p_eval_step,
2081
+ static_argnums=(
2082
+ 0,
2083
+ 1,
2084
+ ),
2085
+ static_return=True,
2086
+ )(
2087
+ student_state.params,
2088
+ teacher_params,
2089
+ batch.data,
2090
+ min_device_batch=per_device_eval_batch_size,
2091
+ )
2092
+ eval_metrics.append(metrics)
2093
+
2094
+ # generation
2095
+ if training_args.predict_with_generate:
2096
+ generated_ids = pad_shard_unpad(p_generate_step)(
2097
+ student_state.params, batch.data, min_device_batch=per_device_eval_batch_size
2098
+ )
2099
+ eval_preds.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
2100
+ eval_labels.extend(labels)
2101
+
2102
+ eval_time = time.time() - eval_start
2103
+
2104
+ # normalize eval metrics
2105
+ eval_metrics = get_metrics(eval_metrics)
2106
+ eval_metrics = jax.tree_util.tree_map(jnp.mean, eval_metrics)
2107
+
2108
+ # compute WER metric
2109
+ wer_desc = ""
2110
+ if training_args.predict_with_generate:
2111
+ wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
2112
+ eval_preds, eval_labels
2113
+ )
2114
+ eval_metrics.update(wer_metric)
2115
+ wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
2116
+
2117
+ # Print metrics and update progress bar
2118
+ steps_trained_progress_bar.write(
2119
+ f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
2120
+ f" {wer_desc})"
2121
+ )
2122
+
2123
+ if has_tensorboard and jax.process_index() == 0:
2124
+ write_eval_metric(
2125
+ summary_writer,
2126
+ eval_metrics,
2127
+ cur_step,
2128
+ prefix=eval_split,
2129
+ )
2130
+
2131
+ if has_wandb and jax.process_index() == 0:
2132
+ write_wandb_metric(wandb_logger, eval_metrics, eval_time, cur_step, epoch, prefix=eval_split)
2133
+ if training_args.predict_with_generate:
2134
+ write_wandb_pred(
2135
+ wandb_logger,
2136
+ pred_str,
2137
+ label_str,
2138
+ norm_pred_str,
2139
+ norm_label_str,
2140
+ cur_step,
2141
+ prefix=eval_split,
2142
+ )
2143
+
2144
+ if has_tensorboard and jax.process_index() == 0:
2145
+ # we'll only log to tensorboard every eval steps
2146
+ write_train_metric(
2147
+ summary_writer,
2148
+ train_metrics,
2149
+ train_time,
2150
+ cur_step,
2151
+ training_args.logging_steps,
2152
+ )
2153
+
2154
+ # flush the train metrics
2155
+ train_start = time.time()
2156
+ train_metrics = []
2157
+
2158
+ # break condition
2159
+ if cur_step == total_train_steps:
2160
+ continue_training = False
2161
+ break
2162
+
2163
+ if not continue_training:
2164
+ break
2165
+
2166
+
2167
+ if __name__ == "__main__":
2168
+ main()
run_large_training.sh ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env bash
2
+ TOKENIZERS_PARALLELISM=false python3 run_distillation_nodes.py \
3
+ --model_name_or_path "./nb-distil-large-init" \
4
+ --teacher_model_name_or_path "NbAiLab/nb-whisper-large" \
5
+ --train_dataset_name "NbAiLab/annotated_distil_raw_ncc_speech_v7_large" \
6
+ --train_dataset_config_name "no" \
7
+ --train_split_name "train" \
8
+ --eval_dataset_name "NbAiLab/annotated_distil_raw_ncc_speech_v7_large" \
9
+ --eval_dataset_config_name "no" \
10
+ --eval_split_name "validation_norwegian_fleurs" \
11
+ --eval_steps 500 \
12
+ --save_steps 1000 \
13
+ --warmup_steps 1000 \
14
+ --learning_rate 0.0001 \
15
+ --lr_scheduler_type "linear" \
16
+ --logging_steps 25 \
17
+ --save_total_limit 1 \
18
+ --max_steps 100000 \
19
+ --wer_threshold 10 \
20
+ --per_device_train_batch_size 32 \
21
+ --per_device_eval_batch_size 32 \
22
+ --dataloader_num_workers 32 \
23
+ --dtype "bfloat16" \
24
+ --output_dir "./" \
25
+ --do_train \
26
+ --do_eval \
27
+ --use_scan \
28
+ --gradient_checkpointing \
29
+ --overwrite_output_dir \
30
+ --predict_with_generate \
31
+ --freeze_encoder \
32
+ --streaming \
33
+ --use_auth_token \
34
+ --report_to "wandb" \
35
+ --wandb_project "nb-distil-whisper-large-flax2" \
36
+ --hub_model_id "NbAiLab/nb-distil-whisper-large-flax2" \
37
+ --push_to_hub
38
+
run_large_training_debug.sh ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env bash
2
+ TOKENIZERS_PARALLELISM=false python3 run_distillation_debug.py \
3
+ --model_name_or_path "./nb-distil-large-init" \
4
+ --teacher_model_name_or_path "NbAiLab/nb-whisper-large" \
5
+ --train_dataset_name "NbAiLab/annotated_distil_raw_ncc_speech_v7_compact8_large" \
6
+ --train_dataset_config_name "no" \
7
+ --train_split_name "train" \
8
+ --eval_dataset_name "NbAiLab/annotated_distil_raw_ncc_speech_v7_compact8_large" \
9
+ --eval_dataset_config_name "no" \
10
+ --eval_split_name "validation_norwegian_fleurs" \
11
+ --eval_steps 5000 \
12
+ --save_steps 5000 \
13
+ --warmup_steps 500 \
14
+ --learning_rate 0.0001 \
15
+ --lr_scheduler_type "linear" \
16
+ --logging_steps 25 \
17
+ --save_total_limit 1 \
18
+ --max_steps 100000 \
19
+ --wer_threshold 10 \
20
+ --per_device_train_batch_size 64 \
21
+ --per_device_eval_batch_size 64 \
22
+ --dataloader_num_workers 16 \
23
+ --dtype "bfloat16" \
24
+ --output_dir "./" \
25
+ --do_train \
26
+ --do_eval \
27
+ --use_scan \
28
+ --gradient_checkpointing \
29
+ --overwrite_output_dir \
30
+ --predict_with_generate \
31
+ --freeze_encoder \
32
+ --streaming \
33
+ --use_auth_token \
34
+ --report_to "wandb" \
35
+ --wandb_project "nb-distil-whisper-large-test2" \
36
+ --hub_model_id "NbAiLab/nb-distil-whisper-large-flax1-no" \
37
+ --push_to_hub
38
+
special_tokens_map.json ADDED
@@ -0,0 +1,139 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|startoftranscript|>",
4
+ "<|en|>",
5
+ "<|zh|>",
6
+ "<|de|>",
7
+ "<|es|>",
8
+ "<|ru|>",
9
+ "<|ko|>",
10
+ "<|fr|>",
11
+ "<|ja|>",
12
+ "<|pt|>",
13
+ "<|tr|>",
14
+ "<|pl|>",
15
+ "<|ca|>",
16
+ "<|nl|>",
17
+ "<|ar|>",
18
+ "<|sv|>",
19
+ "<|it|>",
20
+ "<|id|>",
21
+ "<|hi|>",
22
+ "<|fi|>",
23
+ "<|vi|>",
24
+ "<|he|>",
25
+ "<|uk|>",
26
+ "<|el|>",
27
+ "<|ms|>",
28
+ "<|cs|>",
29
+ "<|ro|>",
30
+ "<|da|>",
31
+ "<|hu|>",
32
+ "<|ta|>",
33
+ "<|no|>",
34
+ "<|th|>",
35
+ "<|ur|>",
36
+ "<|hr|>",
37
+ "<|bg|>",
38
+ "<|lt|>",
39
+ "<|la|>",
40
+ "<|mi|>",
41
+ "<|ml|>",
42
+ "<|cy|>",
43
+ "<|sk|>",
44
+ "<|te|>",
45
+ "<|fa|>",
46
+ "<|lv|>",
47
+ "<|bn|>",
48
+ "<|sr|>",
49
+ "<|az|>",
50
+ "<|sl|>",
51
+ "<|kn|>",
52
+ "<|et|>",
53
+ "<|mk|>",
54
+ "<|br|>",
55
+ "<|eu|>",
56
+ "<|is|>",
57
+ "<|hy|>",
58
+ "<|ne|>",
59
+ "<|mn|>",
60
+ "<|bs|>",
61
+ "<|kk|>",
62
+ "<|sq|>",
63
+ "<|sw|>",
64
+ "<|gl|>",
65
+ "<|mr|>",
66
+ "<|pa|>",
67
+ "<|si|>",
68
+ "<|km|>",
69
+ "<|sn|>",
70
+ "<|yo|>",
71
+ "<|so|>",
72
+ "<|af|>",
73
+ "<|oc|>",
74
+ "<|ka|>",
75
+ "<|be|>",
76
+ "<|tg|>",
77
+ "<|sd|>",
78
+ "<|gu|>",
79
+ "<|am|>",
80
+ "<|yi|>",
81
+ "<|lo|>",
82
+ "<|uz|>",
83
+ "<|fo|>",
84
+ "<|ht|>",
85
+ "<|ps|>",
86
+ "<|tk|>",
87
+ "<|nn|>",
88
+ "<|mt|>",
89
+ "<|sa|>",
90
+ "<|lb|>",
91
+ "<|my|>",
92
+ "<|bo|>",
93
+ "<|tl|>",
94
+ "<|mg|>",
95
+ "<|as|>",
96
+ "<|tt|>",
97
+ "<|haw|>",
98
+ "<|ln|>",
99
+ "<|ha|>",
100
+ "<|ba|>",
101
+ "<|jw|>",
102
+ "<|su|>",
103
+ "<|yue|>",
104
+ "<|translate|>",
105
+ "<|transcribe|>",
106
+ "<|startoflm|>",
107
+ "<|startofprev|>",
108
+ "<|nospeech|>",
109
+ "<|notimestamps|>"
110
+ ],
111
+ "bos_token": {
112
+ "content": "<|endoftext|>",
113
+ "lstrip": false,
114
+ "normalized": false,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "eos_token": {
119
+ "content": "<|endoftext|>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ },
125
+ "pad_token": {
126
+ "content": "<|endoftext|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false
131
+ },
132
+ "unk_token": {
133
+ "content": "<|endoftext|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false
138
+ }
139
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff