{ "architectures": [ "ASTModel" ], "attention_probs_dropout_prob": 0.0, "auto_map": { "AutoConfig": "configuration_audio_spectrogram_transformer.ASTConfig", "AutoModel": "modeling_audio_spectrogram_transformer.ASTModel", "AutoModelForAudioClassification": "modeling_audio_spectrogram_transformer.ASTForAudioClassification" }, "frequency_patch_size": 128, "frequency_stride": 128, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "Speech", "1": "Male speech, man speaking", "2": "Female speech, woman speaking", "3": "Child speech, kid speaking", "4": "Conversation", "5": "Narration, monologue", "6": "Babbling", "7": "Speech synthesizer", "8": "Shout", "9": "Bellow", "10": "Whoop", "11": "Yell", "12": "Battle cry", "13": "Children shouting", "14": "Screaming", "15": "Whispering", "16": "Laughter", "17": "Baby laughter", "18": "Giggle", "19": "Snicker", "20": "Belly laugh", "21": "Chuckle, chortle", "22": "Crying, sobbing", "23": "Baby cry, infant cry", "24": "Whimper", "25": "Wail, moan", "26": "Sigh", "27": "Singing", "28": "Choir", "29": "Yodeling", "30": "Chant", "31": "Mantra", "32": "Male singing", "33": "Female singing", "34": "Child singing", "35": "Synthetic singing", "36": "Rapping", "37": "Humming", "38": "Groan", "39": "Grunt", "40": "Whistling", "41": "Breathing", "42": "Wheeze", "43": "Snoring", "44": "Gasp", "45": "Pant", "46": "Snort", "47": "Cough", "48": "Throat clearing", "49": "Sneeze", "50": "Sniff", "51": "Run", "52": "Shuffle", "53": "Walk, footsteps", "54": "Chewing, mastication", "55": "Biting", "56": "Gargling", "57": "Stomach rumble", "58": "Burping, eructation", "59": "Hiccup", "60": "Fart", "61": "Hands", "62": "Finger snapping", "63": "Clapping", "64": "Heart sounds, heartbeat", "65": "Heart murmur", "66": "Cheering", "67": "Applause", "68": "Chatter", "69": "Crowd", "70": "Hubbub, speech noise, speech babble", "71": "Children playing", "72": "Animal", "73": "Domestic animals, pets", "74": "Dog", "75":
"Bark", "76": "Yip", "77": "Howl", "78": "Bow-wow", "79": "Growling", "80": "Whimper (dog)", "81": "Cat", "82": "Purr", "83": "Meow", "84": "Hiss", "85": "Caterwaul", "86": "Livestock, farm animals, working animals", "87": "Horse", "88": "Clip-clop", "89": "Neigh, whinny", "90": "Cattle, bovinae", "91": "Moo", "92": "Cowbell", "93": "Pig", "94": "Oink", "95": "Goat", "96": "Bleat", "97": "Sheep", "98": "Fowl", "99": "Chicken, rooster", "100": "Cluck", "101": "Crowing, cock-a-doodle-doo", "102": "Turkey", "103": "Gobble", "104": "Duck", "105": "Quack", "106": "Goose", "107": "Honk", "108": "Wild animals", "109": "Roaring cats (lions, tigers)", "110": "Roar", "111": "Bird", "112": "Bird vocalization, bird call, bird song", "113": "Chirp, tweet", "114": "Squawk", "115": "Pigeon, dove", "116": "Coo", "117": "Crow", "118": "Caw", "119": "Owl", "120": "Hoot", "121": "Bird flight, flapping wings", "122": "Canidae, dogs, wolves", "123": "Rodents, rats, mice", "124": "Mouse", "125": "Patter", "126": "Insect", "127": "Cricket", "128": "Mosquito", "129": "Fly, housefly", "130": "Buzz", "131": "Bee, wasp, etc.", "132": "Frog", "133": "Croak", "134": "Snake", "135": "Rattle", "136": "Whale vocalization", "137": "Music", "138": "Musical instrument", "139": "Plucked string instrument", "140": "Guitar", "141": "Electric guitar", "142": "Bass guitar", "143": "Acoustic guitar", "144": "Steel guitar, slide guitar", "145": "Tapping (guitar technique)", "146": "Strum", "147": "Banjo", "148": "Sitar", "149": "Mandolin", "150": "Zither", "151": "Ukulele", "152": "Keyboard (musical)", "153": "Piano", "154": "Electric piano", "155": "Organ", "156": "Electronic organ", "157": "Hammond organ", "158": "Synthesizer", "159": "Sampler", "160": "Harpsichord", "161": "Percussion", "162": "Drum kit", "163": "Drum machine", "164": "Drum", "165": "Snare drum", "166": "Rimshot", "167": "Drum roll", "168": "Bass drum", "169": "Timpani", "170": "Tabla", "171": "Cymbal", "172": "Hi-hat",
"173": "Wood block", "174": "Tambourine", "175": "Rattle (instrument)", "176": "Maraca", "177": "Gong", "178": "Tubular bells", "179": "Mallet percussion", "180": "Marimba, xylophone", "181": "Glockenspiel", "182": "Vibraphone", "183": "Steelpan", "184": "Orchestra", "185": "Brass instrument", "186": "French horn", "187": "Trumpet", "188": "Trombone", "189": "Bowed string instrument", "190": "String section", "191": "Violin, fiddle", "192": "Pizzicato", "193": "Cello", "194": "Double bass", "195": "Wind instrument, woodwind instrument", "196": "Flute", "197": "Saxophone", "198": "Clarinet", "199": "Harp", "200": "Bell", "201": "Church bell", "202": "Jingle bell", "203": "Bicycle bell", "204": "Tuning fork", "205": "Chime", "206": "Wind chime", "207": "Change ringing (campanology)", "208": "Harmonica", "209": "Accordion", "210": "Bagpipes", "211": "Didgeridoo", "212": "Shofar", "213": "Theremin", "214": "Singing bowl", "215": "Scratching (performance technique)", "216": "Pop music", "217": "Hip hop music", "218": "Beatboxing", "219": "Rock music", "220": "Heavy metal", "221": "Punk rock", "222": "Grunge", "223": "Progressive rock", "224": "Rock and roll", "225": "Psychedelic rock", "226": "Rhythm and blues", "227": "Soul music", "228": "Reggae", "229": "Country", "230": "Swing music", "231": "Bluegrass", "232": "Funk", "233": "Folk music", "234": "Middle Eastern music", "235": "Jazz", "236": "Disco", "237": "Classical music", "238": "Opera", "239": "Electronic music", "240": "House music", "241": "Techno", "242": "Dubstep", "243": "Drum and bass", "244": "Electronica", "245": "Electronic dance music", "246": "Ambient music", "247": "Trance music", "248": "Music of Latin America", "249": "Salsa music", "250": "Flamenco", "251": "Blues", "252": "Music for children", "253": "New-age music", "254": "Vocal music", "255": "A capella", "256": "Music of Africa", "257": "Afrobeat", "258": "Christian music", "259": "Gospel music", "260": "Music of Asia", "261": "Carnatic music", "262":
"Music of Bollywood", "263": "Ska", "264": "Traditional music", "265": "Independent music", "266": "Song", "267": "Background music", "268": "Theme music", "269": "Jingle (music)", "270": "Soundtrack music", "271": "Lullaby", "272": "Video game music", "273": "Christmas music", "274": "Dance music", "275": "Wedding music", "276": "Happy music", "277": "Funny music", "278": "Sad music", "279": "Tender music", "280": "Exciting music", "281": "Angry music", "282": "Scary music", "283": "Wind", "284": "Rustling leaves", "285": "Wind noise (microphone)", "286": "Thunderstorm", "287": "Thunder", "288": "Water", "289": "Rain", "290": "Raindrop", "291": "Rain on surface", "292": "Stream", "293": "Waterfall", "294": "Ocean", "295": "Waves, surf", "296": "Steam", "297": "Gurgling", "298": "Fire", "299": "Crackle", "300": "Vehicle", "301": "Boat, Water vehicle", "302": "Sailboat, sailing ship", "303": "Rowboat, canoe, kayak", "304": "Motorboat, speedboat", "305": "Ship", "306": "Motor vehicle (road)", "307": "Car", "308": "Vehicle horn, car horn, honking", "309": "Toot", "310": "Car alarm", "311": "Power windows, electric windows", "312": "Skidding", "313": "Tire squeal", "314": "Car passing by", "315": "Race car, auto racing", "316": "Truck", "317": "Air brake", "318": "Air horn, truck horn", "319": "Reversing beeps", "320": "Ice cream truck, ice cream van", "321": "Bus", "322": "Emergency vehicle", "323": "Police car (siren)", "324": "Ambulance (siren)", "325": "Fire engine, fire truck (siren)", "326": "Motorcycle", "327": "Traffic noise, roadway noise", "328": "Rail transport", "329": "Train", "330": "Train whistle", "331": "Train horn", "332": "Railroad car, train wagon", "333": "Train wheels squealing", "334": "Subway, metro, underground", "335": "Aircraft", "336": "Aircraft engine", "337": "Jet engine", "338": "Propeller, airscrew", "339": "Helicopter", "340": "Fixed-wing aircraft, airplane", "341": "Bicycle", "342": "Skateboard", "343": "Engine",
"344": "Light engine (high frequency)", "345": "Dental drill, dentist's drill", "346": "Lawn mower", "347": "Chainsaw", "348": "Medium engine (mid frequency)", "349": "Heavy engine (low frequency)", "350": "Engine knocking", "351": "Engine starting", "352": "Idling", "353": "Accelerating, revving, vroom", "354": "Door", "355": "Doorbell", "356": "Ding-dong", "357": "Sliding door", "358": "Slam", "359": "Knock", "360": "Tap", "361": "Squeak", "362": "Cupboard open or close", "363": "Drawer open or close", "364": "Dishes, pots, and pans", "365": "Cutlery, silverware", "366": "Chopping (food)", "367": "Frying (food)", "368": "Microwave oven", "369": "Blender", "370": "Water tap, faucet", "371": "Sink (filling or washing)", "372": "Bathtub (filling or washing)", "373": "Hair dryer", "374": "Toilet flush", "375": "Toothbrush", "376": "Electric toothbrush", "377": "Vacuum cleaner", "378": "Zipper (clothing)", "379": "Keys jangling", "380": "Coin (dropping)", "381": "Scissors", "382": "Electric shaver, electric razor", "383": "Shuffling cards", "384": "Typing", "385": "Typewriter", "386": "Computer keyboard", "387": "Writing", "388": "Alarm", "389": "Telephone", "390": "Telephone bell ringing", "391": "Ringtone", "392": "Telephone dialing, DTMF", "393": "Dial tone", "394": "Busy signal", "395": "Alarm clock", "396": "Siren", "397": "Civil defense siren", "398": "Buzzer", "399": "Smoke detector, smoke alarm", "400": "Fire alarm", "401": "Foghorn", "402": "Whistle", "403": "Steam whistle", "404": "Mechanisms", "405": "Ratchet, pawl", "406": "Clock", "407": "Tick", "408": "Tick-tock", "409": "Gears", "410": "Pulleys", "411": "Sewing machine", "412": "Mechanical fan", "413": "Air conditioning", "414": "Cash register", "415": "Printer", "416": "Camera", "417": "Single-lens reflex camera", "418": "Tools", "419": "Hammer", "420": "Jackhammer", "421": "Sawing", "422": "Filing (rasp)", "423": "Sanding", "424": "Power tool", "425": "Drill", "426": "Explosion", "427": "Gunshot, gunfire",
"428": "Machine gun", "429": "Fusillade", "430": "Artillery fire", "431": "Cap gun", "432": "Fireworks", "433": "Firecracker", "434": "Burst, pop", "435": "Eruption", "436": "Boom", "437": "Wood", "438": "Chop", "439": "Splinter", "440": "Crack", "441": "Glass", "442": "Chink, clink", "443": "Shatter", "444": "Liquid", "445": "Splash, splatter", "446": "Slosh", "447": "Squish", "448": "Drip", "449": "Pour", "450": "Trickle, dribble", "451": "Gush", "452": "Fill (with liquid)", "453": "Spray", "454": "Pump (liquid)", "455": "Stir", "456": "Boiling", "457": "Sonar", "458": "Arrow", "459": "Whoosh, swoosh, swish", "460": "Thump, thud", "461": "Thunk", "462": "Electronic tuner", "463": "Effects unit", "464": "Chorus effect", "465": "Basketball bounce", "466": "Bang", "467": "Slap, smack", "468": "Whack, thwack", "469": "Smash, crash", "470": "Breaking", "471": "Bouncing", "472": "Whip", "473": "Flap", "474": "Scratch", "475": "Scrape", "476": "Rub", "477": "Roll", "478": "Crushing", "479": "Crumpling, crinkling", "480": "Tearing", "481": "Beep, bleep", "482": "Ping", "483": "Ding", "484": "Clang", "485": "Squeal", "486": "Creak", "487": "Rustle", "488": "Whir", "489": "Clatter", "490": "Sizzle", "491": "Clicking", "492": "Clickety-clack", "493": "Rumble", "494": "Plop", "495": "Jingle, tinkle", "496": "Hum", "497": "Zing", "498": "Boing", "499": "Crunch", "500": "Silence", "501": "Sine wave", "502": "Harmonic", "503": "Chirp tone", "504": "Sound effect", "505": "Pulse", "506": "Inside, small room", "507": "Inside, large room or hall", "508": "Inside, public space", "509": "Outside, urban or manmade", "510": "Outside, rural or natural", "511": "Reverberation", "512": "Echo", "513": "Noise", "514": "Environmental noise", "515": "Static", "516": "Mains hum", "517": "Distortion", "518": "Sidetone", "519": "Cacophony", "520": "White noise", "521": "Pink noise", "522": "Throbbing", "523": "Vibration", "524": "Television", "525": "Radio", "526": "Field recording" }, "initializer_range":
0.02, "intermediate_size": 3072, "label2id": { "A capella": 255, "Accelerating, revving, vroom": 353, "Accordion": 209, "Acoustic guitar": 143, "Afrobeat": 257, "Air brake": 317, "Air conditioning": 413, "Air horn, truck horn": 318, "Aircraft": 335, "Aircraft engine": 336, "Alarm": 388, "Alarm clock": 395, "Ambient music": 246, "Ambulance (siren)": 324, "Angry music": 281, "Animal": 72, "Applause": 67, "Arrow": 458, "Artillery fire": 430, "Babbling": 6, "Baby cry, infant cry": 23, "Baby laughter": 17, "Background music": 267, "Bagpipes": 210, "Bang": 466, "Banjo": 147, "Bark": 75, "Basketball bounce": 465, "Bass drum": 168, "Bass guitar": 142, "Bathtub (filling or washing)": 372, "Battle cry": 12, "Beatboxing": 218, "Bee, wasp, etc.": 131, "Beep, bleep": 481, "Bell": 200, "Bellow": 9, "Belly laugh": 20, "Bicycle": 341, "Bicycle bell": 203, "Bird": 111, "Bird flight, flapping wings": 121, "Bird vocalization, bird call, bird song": 112, "Biting": 55, "Bleat": 96, "Blender": 369, "Bluegrass": 231, "Blues": 251, "Boat, Water vehicle": 301, "Boiling": 456, "Boing": 498, "Boom": 436, "Bouncing": 471, "Bow-wow": 78, "Bowed string instrument": 189, "Brass instrument": 185, "Breaking": 470, "Breathing": 41, "Burping, eructation": 58, "Burst, pop": 434, "Bus": 321, "Busy signal": 394, "Buzz": 130, "Buzzer": 398, "Cacophony": 519, "Camera": 416, "Canidae, dogs, wolves": 122, "Cap gun": 431, "Car": 307, "Car alarm": 310, "Car passing by": 314, "Carnatic music": 261, "Cash register": 414, "Cat": 81, "Caterwaul": 85, "Cattle, bovinae": 90, "Caw": 118, "Cello": 193, "Chainsaw": 347, "Change ringing (campanology)": 207, "Chant": 30, "Chatter": 68, "Cheering": 66, "Chewing, mastication": 54, "Chicken, rooster": 99, "Child singing": 34, "Child speech, kid speaking": 3, "Children playing": 71, "Children shouting": 13, "Chime": 205, "Chink, clink": 442, "Chirp tone": 503, "Chirp, tweet": 113, "Choir": 28, "Chop": 438, "Chopping (food)": 366, "Chorus effect": 464, "Christian music": 
258, "Christmas music": 273, "Chuckle, chortle": 21, "Church bell": 201, "Civil defense siren": 397, "Clang": 484, "Clapping": 63, "Clarinet": 198, "Classical music": 237, "Clatter": 489, "Clickety-clack": 492, "Clicking": 491, "Clip-clop": 88, "Clock": 406, "Cluck": 100, "Coin (dropping)": 380, "Computer keyboard": 386, "Conversation": 4, "Coo": 116, "Cough": 47, "Country": 229, "Cowbell": 92, "Crack": 440, "Crackle": 299, "Creak": 486, "Cricket": 127, "Croak": 133, "Crow": 117, "Crowd": 69, "Crowing, cock-a-doodle-doo": 101, "Crumpling, crinkling": 479, "Crunch": 499, "Crushing": 478, "Crying, sobbing": 22, "Cupboard open or close": 362, "Cutlery, silverware": 365, "Cymbal": 171, "Dance music": 274, "Dental drill, dentist's drill": 345, "Dial tone": 393, "Didgeridoo": 211, "Ding": 483, "Ding-dong": 356, "Disco": 236, "Dishes, pots, and pans": 364, "Distortion": 517, "Dog": 74, "Domestic animals, pets": 73, "Door": 354, "Doorbell": 355, "Double bass": 194, "Drawer open or close": 363, "Drill": 425, "Drip": 448, "Drum": 164, "Drum and bass": 243, "Drum kit": 162, "Drum machine": 163, "Drum roll": 167, "Dubstep": 242, "Duck": 104, "Echo": 512, "Effects unit": 463, "Electric guitar": 141, "Electric piano": 154, "Electric shaver, electric razor": 382, "Electric toothbrush": 376, "Electronic dance music": 245, "Electronic music": 239, "Electronic organ": 156, "Electronic tuner": 462, "Electronica": 244, "Emergency vehicle": 322, "Engine": 343, "Engine knocking": 350, "Engine starting": 351, "Environmental noise": 514, "Eruption": 435, "Exciting music": 280, "Explosion": 426, "Fart": 60, "Female singing": 33, "Female speech, woman speaking": 2, "Field recording": 526, "Filing (rasp)": 422, "Fill (with liquid)": 452, "Finger snapping": 62, "Fire": 298, "Fire alarm": 400, "Fire engine, fire truck (siren)": 325, "Firecracker": 433, "Fireworks": 432, "Fixed-wing aircraft, airplane": 340, "Flamenco": 250, "Flap": 473, "Flute": 196, "Fly, housefly": 129, "Foghorn": 401,
"Folk music": 233, "Fowl": 98, "French horn": 186, "Frog": 132, "Frying (food)": 367, "Funk": 232, "Funny music": 277, "Fusillade": 429, "Gargling": 56, "Gasp": 44, "Gears": 409, "Giggle": 18, "Glass": 441, "Glockenspiel": 181, "Goat": 95, "Gobble": 103, "Gong": 177, "Goose": 106, "Gospel music": 259, "Groan": 38, "Growling": 79, "Grunge": 222, "Grunt": 39, "Guitar": 140, "Gunshot, gunfire": 427, "Gurgling": 297, "Gush": 451, "Hair dryer": 373, "Hammer": 419, "Hammond organ": 157, "Hands": 61, "Happy music": 276, "Harmonic": 502, "Harmonica": 208, "Harp": 199, "Harpsichord": 160, "Heart murmur": 65, "Heart sounds, heartbeat": 64, "Heavy engine (low frequency)": 349, "Heavy metal": 220, "Helicopter": 339, "Hi-hat": 172, "Hiccup": 59, "Hip hop music": 217, "Hiss": 84, "Honk": 107, "Hoot": 120, "Horse": 87, "House music": 240, "Howl": 77, "Hubbub, speech noise, speech babble": 70, "Hum": 496, "Humming": 37, "Ice cream truck, ice cream van": 320, "Idling": 352, "Independent music": 265, "Insect": 126, "Inside, large room or hall": 507, "Inside, public space": 508, "Inside, small room": 506, "Jackhammer": 420, "Jazz": 235, "Jet engine": 337, "Jingle (music)": 269, "Jingle bell": 202, "Jingle, tinkle": 495, "Keyboard (musical)": 152, "Keys jangling": 379, "Knock": 359, "Laughter": 16, "Lawn mower": 346, "Light engine (high frequency)": 344, "Liquid": 444, "Livestock, farm animals, working animals": 86, "Lullaby": 271, "Machine gun": 428, "Mains hum": 516, "Male singing": 32, "Male speech, man speaking": 1, "Mallet percussion": 179, "Mandolin": 149, "Mantra": 31, "Maraca": 176, "Marimba, xylophone": 180, "Mechanical fan": 412, "Mechanisms": 404, "Medium engine (mid frequency)": 348, "Meow": 83, "Microwave oven": 368, "Middle Eastern music": 234, "Moo": 91, "Mosquito": 128, "Motor vehicle (road)": 306, "Motorboat, speedboat": 304, "Motorcycle": 326, "Mouse": 124, "Music": 137, "Music for children": 252, "Music of Africa": 256, "Music of Asia": 260, "Music of Bollywood": 262,
"Music of Latin America": 248, "Musical instrument": 138, "Narration, monologue": 5, "Neigh, whinny": 89, "New-age music": 253, "Noise": 513, "Ocean": 294, "Oink": 94, "Opera": 238, "Orchestra": 184, "Organ": 155, "Outside, rural or natural": 510, "Outside, urban or manmade": 509, "Owl": 119, "Pant": 45, "Patter": 125, "Percussion": 161, "Piano": 153, "Pig": 93, "Pigeon, dove": 115, "Ping": 482, "Pink noise": 521, "Pizzicato": 192, "Plop": 494, "Plucked string instrument": 139, "Police car (siren)": 323, "Pop music": 216, "Pour": 449, "Power tool": 424, "Power windows, electric windows": 311, "Printer": 415, "Progressive rock": 223, "Propeller, airscrew": 338, "Psychedelic rock": 225, "Pulleys": 410, "Pulse": 505, "Pump (liquid)": 454, "Punk rock": 221, "Purr": 82, "Quack": 105, "Race car, auto racing": 315, "Radio": 525, "Rail transport": 328, "Railroad car, train wagon": 332, "Rain": 289, "Rain on surface": 291, "Raindrop": 290, "Rapping": 36, "Ratchet, pawl": 405, "Rattle": 135, "Rattle (instrument)": 175, "Reggae": 228, "Reverberation": 511, "Reversing beeps": 319, "Rhythm and blues": 226, "Rimshot": 166, "Ringtone": 391, "Roar": 110, "Roaring cats (lions, tigers)": 109, "Rock and roll": 224, "Rock music": 219, "Rodents, rats, mice": 123, "Roll": 477, "Rowboat, canoe, kayak": 303, "Rub": 476, "Rumble": 493, "Run": 51, "Rustle": 487, "Rustling leaves": 284, "Sad music": 278, "Sailboat, sailing ship": 302, "Salsa music": 249, "Sampler": 159, "Sanding": 423, "Sawing": 421, "Saxophone": 197, "Scary music": 282, "Scissors": 381, "Scrape": 475, "Scratch": 474, "Scratching (performance technique)": 215, "Screaming": 14, "Sewing machine": 411, "Shatter": 443, "Sheep": 97, "Ship": 305, "Shofar": 212, "Shout": 8, "Shuffle": 52, "Shuffling cards": 383, "Sidetone": 518, "Sigh": 26, "Silence": 500, "Sine wave": 501, "Singing": 27, "Singing bowl": 214, "Single-lens reflex camera": 417, "Sink (filling or washing)": 371, "Siren": 396, "Sitar": 148, "Sizzle": 490, "Ska": 263,
"Skateboard": 342, "Skidding": 312, "Slam": 358, "Slap, smack": 467, "Sliding door": 357, "Slosh": 446, "Smash, crash": 469, "Smoke detector, smoke alarm": 399, "Snake": 134, "Snare drum": 165, "Sneeze": 49, "Snicker": 19, "Sniff": 50, "Snoring": 43, "Snort": 46, "Sonar": 457, "Song": 266, "Soul music": 227, "Sound effect": 504, "Soundtrack music": 270, "Speech": 0, "Speech synthesizer": 7, "Splash, splatter": 445, "Splinter": 439, "Spray": 453, "Squawk": 114, "Squeak": 361, "Squeal": 485, "Squish": 447, "Static": 515, "Steam": 296, "Steam whistle": 403, "Steel guitar, slide guitar": 144, "Steelpan": 183, "Stir": 455, "Stomach rumble": 57, "Stream": 292, "String section": 190, "Strum": 146, "Subway, metro, underground": 334, "Swing music": 230, "Synthesizer": 158, "Synthetic singing": 35, "Tabla": 170, "Tambourine": 174, "Tap": 360, "Tapping (guitar technique)": 145, "Tearing": 480, "Techno": 241, "Telephone": 389, "Telephone bell ringing": 390, "Telephone dialing, DTMF": 392, "Television": 524, "Tender music": 279, "Theme music": 268, "Theremin": 213, "Throat clearing": 48, "Throbbing": 522, "Thump, thud": 460, "Thunder": 287, "Thunderstorm": 286, "Thunk": 461, "Tick": 407, "Tick-tock": 408, "Timpani": 169, "Tire squeal": 313, "Toilet flush": 374, "Tools": 418, "Toot": 309, "Toothbrush": 375, "Traditional music": 264, "Traffic noise, roadway noise": 327, "Train": 329, "Train horn": 331, "Train wheels squealing": 333, "Train whistle": 330, "Trance music": 247, "Trickle, dribble": 450, "Trombone": 188, "Truck": 316, "Trumpet": 187, "Tubular bells": 178, "Tuning fork": 204, "Turkey": 102, "Typewriter": 385, "Typing": 384, "Ukulele": 151, "Vacuum cleaner": 377, "Vehicle": 300, "Vehicle horn, car horn, honking": 308, "Vibraphone": 182, "Vibration": 523, "Video game music": 272, "Violin, fiddle": 191, "Vocal music": 254, "Wail, moan": 25, "Walk, footsteps": 53, "Water": 288, "Water tap, faucet": 370, "Waterfall": 293, "Waves, surf": 295, "Wedding music": 275,
"Whack, thwack": 468, "Whale vocalization": 136, "Wheeze": 42, "Whimper": 24, "Whimper (dog)": 80, "Whip": 472, "Whir": 488, "Whispering": 15, "Whistle": 402, "Whistling": 40, "White noise": 520, "Whoop": 10, "Whoosh, swoosh, swish": 459, "Wild animals": 108, "Wind": 283, "Wind chime": 206, "Wind instrument, woodwind instrument": 195, "Wind noise (microphone)": 285, "Wood": 437, "Wood block": 173, "Writing": 387, "Yell": 11, "Yip": 76, "Yodeling": 29, "Zing": 497, "Zipper (clothing)": 378, "Zither": 150 }, "layer_norm_eps": 1e-12, "max_length": 1024, "model_type": "audio-spectrogram-transformer", "num_attention_heads": 12, "num_hidden_layers": 12, "num_mel_bins": 128, "patch_size": 16, "qkv_bias": true, "time_patch_size": 2, "time_stride": 2, "torch_dtype": "float32", "transformers_version": "4.42.3" }