Shikhar Bharadwaj committed on
Commit
8c43a65
·
1 Parent(s): 842ef21

Update model

Browse files
README.md ADDED
@@ -0,0 +1,783 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - classification
6
+ datasets:
7
+ - as2m
8
+ license: cc-by-4.0
9
+ ---
10
+
11
+ ## ESPnet2 CLS model
12
+
13
+ ### `espnet/OpenBEATS-Large-i2-as2m`
14
+
15
+ This model was trained by Shikhar Bharadwaj using the as2m recipe in [espnet](https://github.com/espnet/espnet/).
16
+
17
+ ## CLS config
18
+
19
+ <details><summary>expand</summary>
20
+
21
+ ```
22
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earlarge2.final/conf/ear_large/audioset2m.yaml
23
+ print_config: false
24
+ log_level: INFO
25
+ drop_last_iter: false
26
+ dry_run: false
27
+ iterator_type: sequence
28
+ valid_iterator_type: null
29
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2.final
30
+ ngpu: 0
31
+ seed: 0
32
+ num_workers: 2
33
+ num_att_plot: 0
34
+ dist_backend: nccl
35
+ dist_init_method: env://
36
+ dist_world_size: null
37
+ dist_rank: null
38
+ local_rank: null
39
+ dist_master_addr: null
40
+ dist_master_port: null
41
+ dist_launcher: null
42
+ multiprocessing_distributed: false
43
+ unused_parameters: true
44
+ sharded_ddp: false
45
+ use_deepspeed: false
46
+ deepspeed_config: null
47
+ gradient_as_bucket_view: true
48
+ ddp_comm_hook: null
49
+ cudnn_enabled: true
50
+ cudnn_benchmark: false
51
+ cudnn_deterministic: true
52
+ use_tf32: false
53
+ collect_stats: false
54
+ write_collected_feats: false
55
+ max_epoch: 25
56
+ patience: null
57
+ val_scheduler_criterion:
58
+ - valid
59
+ - loss
60
+ early_stopping_criterion:
61
+ - valid
62
+ - loss
63
+ - min
64
+ best_model_criterion:
65
+ - - valid
66
+ - epoch_mAP
67
+ - max
68
+ keep_nbest_models: 1
69
+ nbest_averaging_interval: 0
70
+ grad_clip: 1
71
+ grad_clip_type: 2.0
72
+ grad_noise: false
73
+ accum_grad: 2
74
+ no_forward_run: false
75
+ resume: true
76
+ train_dtype: float32
77
+ use_amp: false
78
+ log_interval: null
79
+ use_matplotlib: true
80
+ use_tensorboard: true
81
+ create_graph_in_tensorboard: false
82
+ use_wandb: true
83
+ wandb_project: audioverse
84
+ wandb_id: null
85
+ wandb_entity: shikhar
86
+ wandb_name: audioset2m.earlarge2.final
87
+ wandb_model_log_interval: -1
88
+ detect_anomaly: false
89
+ use_adapter: false
90
+ adapter: lora
91
+ save_strategy: all
92
+ adapter_conf: {}
93
+ pretrain_path: null
94
+ init_param: []
95
+ ignore_init_mismatch: false
96
+ freeze_param: []
97
+ num_iters_per_epoch: null
98
+ batch_size: 20
99
+ valid_batch_size: null
100
+ batch_bins: 20000000
101
+ valid_batch_bins: null
102
+ category_sample_size: 10
103
+ train_shape_file:
104
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/train/speech_shape
105
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/train/label_shape
106
+ valid_shape_file:
107
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/valid/speech_shape
108
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/valid/label_shape
109
+ batch_type: length_weighted
110
+ valid_batch_type: length
111
+ fold_length:
112
+ - 160000
113
+ - 600
114
+ sort_in_batch: descending
115
+ shuffle_within_batch: false
116
+ sort_batch: descending
117
+ multiple_iterator: false
118
+ utt2weight_file: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/audioset2m/utt2weight
119
+ chunk_length: 500
120
+ chunk_shift_ratio: 0.5
121
+ num_cache_chunks: 1024
122
+ chunk_excluded_key_prefixes: []
123
+ chunk_default_fs: null
124
+ chunk_max_abs_length: null
125
+ chunk_discard_short_samples: true
126
+ train_data_path_and_name_and_type:
127
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/train/wav.scp
128
+ - speech
129
+ - sound
130
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/train/text
131
+ - label
132
+ - text
133
+ valid_data_path_and_name_and_type:
134
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/eval/wav.scp
135
+ - speech
136
+ - sound
137
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/eval/text
138
+ - label
139
+ - text
140
+ multi_task_dataset: false
141
+ allow_variable_data_keys: false
142
+ max_cache_size: 0.0
143
+ max_cache_fd: 32
144
+ allow_multi_rates: false
145
+ valid_max_cache_size: null
146
+ exclude_weight_decay: false
147
+ exclude_weight_decay_conf: {}
148
+ optim: adamw
149
+ optim_conf:
150
+ lr: 0.0001
151
+ weight_decay: 0.01
152
+ betas:
153
+ - 0.9
154
+ - 0.98
155
+ scheduler: cosineannealingwarmuprestarts
156
+ scheduler_conf:
157
+ first_cycle_steps: 400000
158
+ warmup_steps: 10000
159
+ max_lr: 0.0001
160
+ min_lr: 5.0e-06
161
+ lightning_conf:
162
+ log_every_n_steps: 250
163
+ max_epochs: 25
164
+ strategy: ddp
165
+ strategy_conf:
166
+ find_unused_parameters: true
167
+ best_model_criterion:
168
+ - - valid/epoch_mAP
169
+ - max
170
+ - 1
171
+ devices: 4
172
+ num_nodes: 1
173
+ default_root_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2.final
174
+ token_list:
175
+ - Music
176
+ - Speech
177
+ - Vehicle
178
+ - Musical_instrument
179
+ - Inside,_small_room
180
+ - Guitar
181
+ - Plucked_string_instrument
182
+ - Singing
183
+ - Car
184
+ - Animal
185
+ - Electronic_music
186
+ - Outside,_rural_or_natural
187
+ - Outside,_urban_or_manmade
188
+ - Violin,_fiddle
189
+ - Inside,_large_room_or_hall
190
+ - Bird
191
+ - Drum
192
+ - Domestic_animals,_pets
193
+ - Dubstep
194
+ - Male_speech,_man_speaking
195
+ - Techno
196
+ - Percussion
197
+ - Engine
198
+ - Narration,_monologue
199
+ - Drum_kit
200
+ - Acoustic_guitar
201
+ - Strum
202
+ - Dog
203
+ - Boat,_Water_vehicle
204
+ - Train
205
+ - Electric_guitar
206
+ - Accelerating,_revving,_vroom
207
+ - Piano
208
+ - Child_speech,_kid_speaking
209
+ - Truck
210
+ - Keyboard_(musical)
211
+ - Crowd
212
+ - Bowed_string_instrument
213
+ - Bass_drum
214
+ - Rock_and_roll
215
+ - Motor_vehicle_(road)
216
+ - Pop_music
217
+ - Rail_transport
218
+ - Pigeon,_dove
219
+ - Water
220
+ - Female_speech,_woman_speaking
221
+ - Orchestra
222
+ - Rock_music
223
+ - Sound_effect
224
+ - Motorboat,_speedboat
225
+ - Railroad_car,_train_wagon
226
+ - Siren
227
+ - Tools
228
+ - Female_singing
229
+ - Hip_hop_music
230
+ - Silence
231
+ - Motorcycle
232
+ - Male_singing
233
+ - Brass_instrument
234
+ - Classical_music
235
+ - Snare_drum
236
+ - Inside,_public_space
237
+ - Choir
238
+ - Soundtrack_music
239
+ - House_music
240
+ - Wind
241
+ - Race_car,_auto_racing
242
+ - Heavy_metal
243
+ - Bass_guitar
244
+ - Chicken,_rooster
245
+ - Wind_noise_(microphone)
246
+ - Idling
247
+ - Medium_engine_(mid_frequency)
248
+ - Bird_vocalization,_bird_call,_bird_song
249
+ - Fowl
250
+ - Wind_instrument,_woodwind_instrument
251
+ - Exciting_music
252
+ - Country
253
+ - Laughter
254
+ - Marimba,_xylophone
255
+ - Sampler
256
+ - Emergency_vehicle
257
+ - Aircraft
258
+ - Electronica
259
+ - Ukulele
260
+ - Cello
261
+ - Bus
262
+ - Rhythm_and_blues
263
+ - Synthesizer
264
+ - Background_music
265
+ - Jazz
266
+ - Mantra
267
+ - Dance_music
268
+ - Flute
269
+ - Blues
270
+ - Effects_unit
271
+ - Electric_piano
272
+ - Cymbal
273
+ - Chirp,_tweet
274
+ - Rimshot
275
+ - Rapping
276
+ - Trance_music
277
+ - Livestock,_farm_animals,_working_animals
278
+ - Run
279
+ - Cheering
280
+ - Electronic_dance_music
281
+ - Theme_music
282
+ - Gospel_music
283
+ - Heavy_engine_(low_frequency)
284
+ - Radio
285
+ - Music_of_Latin_America
286
+ - Disco
287
+ - Glockenspiel
288
+ - Tender_music
289
+ - Punk_rock
290
+ - Funk
291
+ - Gunshot,_gunfire
292
+ - Hi-hat
293
+ - Bow-wow
294
+ - Cat
295
+ - Music_of_Asia
296
+ - Trumpet
297
+ - Car_passing_by
298
+ - Drum_and_bass
299
+ - Vehicle_horn,_car_horn,_honking
300
+ - Helicopter
301
+ - Vocal_music
302
+ - Police_car_(siren)
303
+ - Pizzicato
304
+ - Progressive_rock
305
+ - Tap
306
+ - Printer
307
+ - Video_game_music
308
+ - Music_for_children
309
+ - Clip-clop
310
+ - Ocean
311
+ - Drum_machine
312
+ - Rain
313
+ - Horse
314
+ - Power_tool
315
+ - Wood
316
+ - Lullaby
317
+ - Mallet_percussion
318
+ - Reggae
319
+ - Fire_engine,_fire_truck_(siren)
320
+ - New-age_music
321
+ - Christian_music
322
+ - Spray
323
+ - Saxophone
324
+ - Fireworks
325
+ - Skateboard
326
+ - Independent_music
327
+ - Fixed-wing_aircraft,_airplane
328
+ - Drum_roll
329
+ - Insect
330
+ - Bicycle
331
+ - Coo
332
+ - Tick-tock
333
+ - Accordion
334
+ - Scratching_(performance_technique)
335
+ - Soul_music
336
+ - Rain_on_surface
337
+ - Stream
338
+ - Rowboat,_canoe,_kayak
339
+ - Organ
340
+ - Ambient_music
341
+ - Steel_guitar,_slide_guitar
342
+ - Waves,_surf
343
+ - Distortion
344
+ - Music_of_Africa
345
+ - Trombone
346
+ - Bluegrass
347
+ - Sailboat,_sailing_ship
348
+ - Afrobeat
349
+ - Sheep
350
+ - Child_singing
351
+ - Bark
352
+ - Duck
353
+ - Salsa_music
354
+ - Quack
355
+ - Door
356
+ - Hiss
357
+ - Cluck
358
+ - Beatboxing
359
+ - Music_of_Bollywood
360
+ - Banjo
361
+ - Vibration
362
+ - Water_tap,_faucet
363
+ - Baby_cry,_infant_cry
364
+ - Mandolin
365
+ - Yip
366
+ - Flamenco
367
+ - Snoring
368
+ - Psychedelic_rock
369
+ - Opera
370
+ - Explosion
371
+ - Double_bass
372
+ - Burst,_pop
373
+ - Toilet_flush
374
+ - Conversation
375
+ - Applause
376
+ - Television
377
+ - Harmonica
378
+ - Train_horn
379
+ - Folk_music
380
+ - Wood_block
381
+ - Christmas_music
382
+ - Clarinet
383
+ - Steam
384
+ - Typing
385
+ - Crowing,_cock-a-doodle-doo
386
+ - Bleat
387
+ - Basketball_bounce
388
+ - Middle_Eastern_music
389
+ - Goat
390
+ - Roll
391
+ - Harp
392
+ - Harpsichord
393
+ - Computer_keyboard
394
+ - Grunge
395
+ - Vacuum_cleaner
396
+ - Whistling
397
+ - Drill
398
+ - Clickety-clack
399
+ - Meow
400
+ - Chatter
401
+ - Gurgling
402
+ - A_capella
403
+ - Civil_defense_siren
404
+ - Whoop
405
+ - Sewing_machine
406
+ - Bee,_wasp,_etc.
407
+ - Ambulance_(siren)
408
+ - Machine_gun
409
+ - Bell
410
+ - Honk
411
+ - Snicker
412
+ - Whimper_(dog)
413
+ - Dishes,_pots,_and_pans
414
+ - Tapping_(guitar_technique)
415
+ - White_noise
416
+ - Sad_music
417
+ - Thump,_thud
418
+ - Chainsaw
419
+ - Goose
420
+ - Bagpipes
421
+ - Oink
422
+ - Tick
423
+ - Song
424
+ - Traditional_music
425
+ - Tabla
426
+ - Scary_music
427
+ - Subway,_metro,_underground
428
+ - Rustling_leaves
429
+ - Liquid
430
+ - Ska
431
+ - Chant
432
+ - Speech_synthesizer
433
+ - Canidae,_dogs,_wolves
434
+ - Walk,_footsteps
435
+ - Mechanisms
436
+ - Chuckle,_chortle
437
+ - Traffic_noise,_roadway_noise
438
+ - Bathtub_(filling_or_washing)
439
+ - Boom
440
+ - Carnatic_music
441
+ - Fusillade
442
+ - Swing_music
443
+ - Whispering
444
+ - Lawn_mower
445
+ - Crumpling,_crinkling
446
+ - Frying_(food)
447
+ - Beep,_bleep
448
+ - Sitar
449
+ - Tire_squeal
450
+ - Whack,_thwack
451
+ - Sink_(filling_or_washing)
452
+ - Skidding
453
+ - Vibraphone
454
+ - Rub
455
+ - Waterfall
456
+ - Crackle
457
+ - Rustle
458
+ - Smash,_crash
459
+ - Hubbub,_speech_noise,_speech_babble
460
+ - Happy_music
461
+ - Electronic_tuner
462
+ - Crying,_sobbing
463
+ - Fire
464
+ - Jingle_(music)
465
+ - Fly,_housefly
466
+ - Hammond_organ
467
+ - Whoosh,_swoosh,_swish
468
+ - Hum
469
+ - Timpani
470
+ - Shout
471
+ - Clatter
472
+ - Trickle,_dribble
473
+ - Sizzle
474
+ - Electronic_organ
475
+ - Burping,_eructation
476
+ - Firecracker
477
+ - Throbbing
478
+ - Telephone
479
+ - Thunder
480
+ - Echo
481
+ - Blender
482
+ - Thunderstorm
483
+ - Frog
484
+ - Whimper
485
+ - Didgeridoo
486
+ - Environmental_noise
487
+ - Snake
488
+ - Raindrop
489
+ - Fart
490
+ - French_horn
491
+ - Slosh
492
+ - Zither
493
+ - Screaming
494
+ - Jingle,_tinkle
495
+ - Church_bell
496
+ - String_section
497
+ - Sneeze
498
+ - Jingle_bell
499
+ - Aircraft_engine
500
+ - Funny_music
501
+ - Angry_music
502
+ - Wild_animals
503
+ - Turkey
504
+ - Engine_starting
505
+ - Rattle
506
+ - Arrow
507
+ - Ringtone
508
+ - Propeller,_airscrew
509
+ - Sliding_door
510
+ - Heart_sounds,_heartbeat
511
+ - Pink_noise
512
+ - Steelpan
513
+ - Giggle
514
+ - Pig
515
+ - Buzzer
516
+ - Artillery_fire
517
+ - Splash,_splatter
518
+ - Roar
519
+ - Fire_alarm
520
+ - Hiccup
521
+ - Gobble
522
+ - Air_brake
523
+ - Chime
524
+ - Plop
525
+ - Singing_bowl
526
+ - Cattle,_bovinae
527
+ - Cutlery,_silverware
528
+ - Slap,_smack
529
+ - Cough
530
+ - Ship
531
+ - Reverberation
532
+ - Babbling
533
+ - Cacophony
534
+ - Electric_shaver,_electric_razor
535
+ - Baby_laughter
536
+ - Cricket
537
+ - Crow
538
+ - Writing
539
+ - Glass
540
+ - Howl
541
+ - Slam
542
+ - Belly_laugh
543
+ - Mechanical_fan
544
+ - Breathing
545
+ - Children_playing
546
+ - Chewing,_mastication
547
+ - Microwave_oven
548
+ - Stir
549
+ - Wedding_music
550
+ - Ping
551
+ - Sawing
552
+ - Eruption
553
+ - Steam_whistle
554
+ - Yell
555
+ - Clapping
556
+ - Alarm
557
+ - Chink,_clink
558
+ - Growling
559
+ - Roaring_cats_(lions,_tigers)
560
+ - Drip
561
+ - Air_horn,_truck_horn
562
+ - Clicking
563
+ - Toot
564
+ - Ding
565
+ - Harmonic
566
+ - Jet_engine
567
+ - Moo
568
+ - Tambourine
569
+ - Train_whistle
570
+ - Clock
571
+ - Children_shouting
572
+ - Engine_knocking
573
+ - Patter
574
+ - Ratchet,_pawl
575
+ - Filing_(rasp)
576
+ - Groan
577
+ - Telephone_bell_ringing
578
+ - Change_ringing_(campanology)
579
+ - Cap_gun
580
+ - Whistle
581
+ - Synthetic_singing
582
+ - Rodents,_rats,_mice
583
+ - Fill_(with_liquid)
584
+ - Owl
585
+ - Theremin
586
+ - Typewriter
587
+ - Gears
588
+ - Pump_(liquid)
589
+ - Caw
590
+ - Maraca
591
+ - Neigh,_whinny
592
+ - Alarm_clock
593
+ - Wind_chime
594
+ - Shuffle
595
+ - Smoke_detector,_smoke_alarm
596
+ - Train_wheels_squealing
597
+ - Purr
598
+ - Static
599
+ - Caterwaul
600
+ - Snort
601
+ - Bellow
602
+ - Tubular_bells
603
+ - Yodeling
604
+ - Keys_jangling
605
+ - Battle_cry
606
+ - Hammer
607
+ - Rumble
608
+ - Gong
609
+ - Single-lens_reflex_camera
610
+ - Flap
611
+ - Boing
612
+ - Car_alarm
613
+ - Ding-dong
614
+ - Breaking
615
+ - Sine_wave
616
+ - Telephone_dialing,_DTMF
617
+ - Whip
618
+ - Boiling
619
+ - Buzz
620
+ - Mains_hum
621
+ - Chop
622
+ - Heart_murmur
623
+ - Light_engine_(high_frequency)
624
+ - Hands
625
+ - Camera
626
+ - Humming
627
+ - Thunk
628
+ - Rattle_(instrument)
629
+ - Scrape
630
+ - Gush
631
+ - Mouse
632
+ - Air_conditioning
633
+ - Scratch
634
+ - Pour
635
+ - Coin_(dropping)
636
+ - Hair_dryer
637
+ - Chorus_effect
638
+ - Croak
639
+ - Squish
640
+ - Dial_tone
641
+ - Crunch
642
+ - Tearing
643
+ - Field_recording
644
+ - Cash_register
645
+ - Zipper_(clothing)
646
+ - Bang
647
+ - Shatter
648
+ - Biting
649
+ - Throat_clearing
650
+ - Bird_flight,_flapping_wings
651
+ - Noise
652
+ - Mosquito
653
+ - Reversing_beeps
654
+ - Sonar
655
+ - Grunt
656
+ - Shofar
657
+ - Doorbell
658
+ - Ice_cream_truck,_ice_cream_van
659
+ - Gasp
660
+ - Shuffling_cards
661
+ - Knock
662
+ - Sigh
663
+ - Scissors
664
+ - Clang
665
+ - Cowbell
666
+ - Pant
667
+ - Electric_toothbrush
668
+ - Jackhammer
669
+ - Busy_signal
670
+ - Drawer_open_or_close
671
+ - Whir
672
+ - Power_windows,_electric_windows
673
+ - Crack
674
+ - Whale_vocalization
675
+ - Zing
676
+ - Stomach_rumble
677
+ - Wail,_moan
678
+ - Bouncing
679
+ - Pulse
680
+ - Foghorn
681
+ - Bicycle_bell
682
+ - Sniff
683
+ - Chirp_tone
684
+ - Squeak
685
+ - Cupboard_open_or_close
686
+ - Sanding
687
+ - Sidetone
688
+ - Wheeze
689
+ - Squawk
690
+ - Squeal
691
+ - Splinter
692
+ - Dental_drill,_dentist's_drill
693
+ - Finger_snapping
694
+ - Chopping_(food)
695
+ - Tuning_fork
696
+ - Gargling
697
+ - Pulleys
698
+ - Toothbrush
699
+ - Creak
700
+ - Crushing
701
+ - Hoot
702
+ - <blank>
703
+ - <unk>
704
+ text_token_list: null
705
+ text_bpemodel: null
706
+ init: xavier_normal
707
+ input_size: 1
708
+ use_preprocessor: true
709
+ frontend: null
710
+ frontend_conf: {}
711
+ specaug: null
712
+ specaug_conf: {}
713
+ normalize: null
714
+ normalize_conf: {}
715
+ preencoder: null
716
+ preencoder_conf: {}
717
+ encoder: beats
718
+ encoder_conf:
719
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/7Msounds/exp/beats_iter1_large1.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch_latest.pt
720
+ beats_config:
721
+ layer_wise_gradient_decay_ratio: 0.6
722
+ encoder_layerdrop: 0.1
723
+ dropout: 0.0
724
+ use_weighted_representation: false
725
+ specaug_config:
726
+ apply_time_warp: true
727
+ apply_freq_mask: false
728
+ apply_time_mask: true
729
+ time_mask_width_ratio_range:
730
+ - 0
731
+ - 0.06
732
+ num_time_mask: 1
733
+ roll_augment: true
734
+ roll_interval: 1
735
+ text_encoder: null
736
+ text_encoder_conf: {}
737
+ embedding_fusion: null
738
+ embedding_fusion_conf: {}
739
+ decoder: linear
740
+ decoder_conf: {}
741
+ model: espnet
742
+ model_conf:
743
+ classification_type: multi-label
744
+ mixup_probability: 0.8
745
+ lsm_weight: 0.1
746
+ log_epoch_metrics: true
747
+ user_callbacks:
748
+ - mAP_logging
749
+ required:
750
+ - output_dir
751
+ - token_list
752
+ task: cls
753
+ ```
754
+
755
+ </details>
756
+
757
+ ### Citations
758
+
759
+ ```BibTex
760
+
761
+ @article{bharadwaj2025openbeats,
762
+ title={OpenBEATs: A Fully Open-Source General-Purpose Audio Encoder},
763
+ author={Bharadwaj, Shikhar and Cornell, Samuele and Choi, Kwanghee and Fukayama, Satoru and Shim, Hye-jin and Deshmukh, Soham and Watanabe, Shinji},
764
+ journal={arXiv preprint arXiv:2507.14129},
765
+ year={2025}
766
+ }
767
+
768
+ @inproceedings{watanabe2018espnet,
769
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
770
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
771
+ year={2018},
772
+ booktitle={Proceedings of Interspeech},
773
+ pages={2207--2211},
774
+ doi={10.21437/Interspeech.2018-1456},
775
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
776
+ }
777
+
778
+
779
+
780
+
781
+
782
+
783
+ ```
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202503'
2
+ files:
3
+ classification_model_file: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2/valid.epoch_mAP.ave_1best.pth
4
+ python: "3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) \n[GCC 12.3.0]"
5
+ timestamp: 1763330432.365711
6
+ torch: 2.1.2
7
+ yaml_files:
8
+ classification_train_config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2/config.yaml
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/audioset2m/token_list ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Music
2
+ Speech
3
+ Vehicle
4
+ Musical_instrument
5
+ Inside,_small_room
6
+ Guitar
7
+ Plucked_string_instrument
8
+ Singing
9
+ Car
10
+ Animal
11
+ Electronic_music
12
+ Outside,_rural_or_natural
13
+ Outside,_urban_or_manmade
14
+ Violin,_fiddle
15
+ Inside,_large_room_or_hall
16
+ Bird
17
+ Drum
18
+ Domestic_animals,_pets
19
+ Dubstep
20
+ Male_speech,_man_speaking
21
+ Techno
22
+ Percussion
23
+ Engine
24
+ Narration,_monologue
25
+ Drum_kit
26
+ Acoustic_guitar
27
+ Strum
28
+ Dog
29
+ Boat,_Water_vehicle
30
+ Train
31
+ Electric_guitar
32
+ Accelerating,_revving,_vroom
33
+ Piano
34
+ Child_speech,_kid_speaking
35
+ Truck
36
+ Keyboard_(musical)
37
+ Crowd
38
+ Bowed_string_instrument
39
+ Bass_drum
40
+ Rock_and_roll
41
+ Motor_vehicle_(road)
42
+ Pop_music
43
+ Rail_transport
44
+ Pigeon,_dove
45
+ Water
46
+ Female_speech,_woman_speaking
47
+ Orchestra
48
+ Rock_music
49
+ Sound_effect
50
+ Motorboat,_speedboat
51
+ Railroad_car,_train_wagon
52
+ Siren
53
+ Tools
54
+ Female_singing
55
+ Hip_hop_music
56
+ Silence
57
+ Motorcycle
58
+ Male_singing
59
+ Brass_instrument
60
+ Classical_music
61
+ Snare_drum
62
+ Inside,_public_space
63
+ Choir
64
+ Soundtrack_music
65
+ House_music
66
+ Wind
67
+ Race_car,_auto_racing
68
+ Heavy_metal
69
+ Bass_guitar
70
+ Chicken,_rooster
71
+ Wind_noise_(microphone)
72
+ Idling
73
+ Medium_engine_(mid_frequency)
74
+ Bird_vocalization,_bird_call,_bird_song
75
+ Fowl
76
+ Wind_instrument,_woodwind_instrument
77
+ Exciting_music
78
+ Country
79
+ Laughter
80
+ Marimba,_xylophone
81
+ Sampler
82
+ Emergency_vehicle
83
+ Aircraft
84
+ Electronica
85
+ Ukulele
86
+ Cello
87
+ Bus
88
+ Rhythm_and_blues
89
+ Synthesizer
90
+ Background_music
91
+ Jazz
92
+ Mantra
93
+ Dance_music
94
+ Flute
95
+ Blues
96
+ Effects_unit
97
+ Electric_piano
98
+ Cymbal
99
+ Chirp,_tweet
100
+ Rimshot
101
+ Rapping
102
+ Trance_music
103
+ Livestock,_farm_animals,_working_animals
104
+ Run
105
+ Cheering
106
+ Electronic_dance_music
107
+ Theme_music
108
+ Gospel_music
109
+ Heavy_engine_(low_frequency)
110
+ Radio
111
+ Music_of_Latin_America
112
+ Disco
113
+ Glockenspiel
114
+ Tender_music
115
+ Punk_rock
116
+ Funk
117
+ Gunshot,_gunfire
118
+ Hi-hat
119
+ Bow-wow
120
+ Cat
121
+ Music_of_Asia
122
+ Trumpet
123
+ Car_passing_by
124
+ Drum_and_bass
125
+ Vehicle_horn,_car_horn,_honking
126
+ Helicopter
127
+ Vocal_music
128
+ Police_car_(siren)
129
+ Pizzicato
130
+ Progressive_rock
131
+ Tap
132
+ Printer
133
+ Video_game_music
134
+ Music_for_children
135
+ Clip-clop
136
+ Ocean
137
+ Drum_machine
138
+ Rain
139
+ Horse
140
+ Power_tool
141
+ Wood
142
+ Lullaby
143
+ Mallet_percussion
144
+ Reggae
145
+ Fire_engine,_fire_truck_(siren)
146
+ New-age_music
147
+ Christian_music
148
+ Spray
149
+ Saxophone
150
+ Fireworks
151
+ Skateboard
152
+ Independent_music
153
+ Fixed-wing_aircraft,_airplane
154
+ Drum_roll
155
+ Insect
156
+ Bicycle
157
+ Coo
158
+ Tick-tock
159
+ Accordion
160
+ Scratching_(performance_technique)
161
+ Soul_music
162
+ Rain_on_surface
163
+ Stream
164
+ Rowboat,_canoe,_kayak
165
+ Organ
166
+ Ambient_music
167
+ Steel_guitar,_slide_guitar
168
+ Waves,_surf
169
+ Distortion
170
+ Music_of_Africa
171
+ Trombone
172
+ Bluegrass
173
+ Sailboat,_sailing_ship
174
+ Afrobeat
175
+ Sheep
176
+ Child_singing
177
+ Bark
178
+ Duck
179
+ Salsa_music
180
+ Quack
181
+ Door
182
+ Hiss
183
+ Cluck
184
+ Beatboxing
185
+ Music_of_Bollywood
186
+ Banjo
187
+ Vibration
188
+ Water_tap,_faucet
189
+ Baby_cry,_infant_cry
190
+ Mandolin
191
+ Yip
192
+ Flamenco
193
+ Snoring
194
+ Psychedelic_rock
195
+ Opera
196
+ Explosion
197
+ Double_bass
198
+ Burst,_pop
199
+ Toilet_flush
200
+ Conversation
201
+ Applause
202
+ Television
203
+ Harmonica
204
+ Train_horn
205
+ Folk_music
206
+ Wood_block
207
+ Christmas_music
208
+ Clarinet
209
+ Steam
210
+ Typing
211
+ Crowing,_cock-a-doodle-doo
212
+ Bleat
213
+ Basketball_bounce
214
+ Middle_Eastern_music
215
+ Goat
216
+ Roll
217
+ Harp
218
+ Harpsichord
219
+ Computer_keyboard
220
+ Grunge
221
+ Vacuum_cleaner
222
+ Whistling
223
+ Drill
224
+ Clickety-clack
225
+ Meow
226
+ Chatter
227
+ Gurgling
228
+ A_capella
229
+ Civil_defense_siren
230
+ Whoop
231
+ Sewing_machine
232
+ Bee,_wasp,_etc.
233
+ Ambulance_(siren)
234
+ Machine_gun
235
+ Bell
236
+ Honk
237
+ Snicker
238
+ Whimper_(dog)
239
+ Dishes,_pots,_and_pans
240
+ Tapping_(guitar_technique)
241
+ White_noise
242
+ Sad_music
243
+ Thump,_thud
244
+ Chainsaw
245
+ Goose
246
+ Bagpipes
247
+ Oink
248
+ Tick
249
+ Song
250
+ Traditional_music
251
+ Tabla
252
+ Scary_music
253
+ Subway,_metro,_underground
254
+ Rustling_leaves
255
+ Liquid
256
+ Ska
257
+ Chant
258
+ Speech_synthesizer
259
+ Canidae,_dogs,_wolves
260
+ Walk,_footsteps
261
+ Mechanisms
262
+ Chuckle,_chortle
263
+ Traffic_noise,_roadway_noise
264
+ Bathtub_(filling_or_washing)
265
+ Boom
266
+ Carnatic_music
267
+ Fusillade
268
+ Swing_music
269
+ Whispering
270
+ Lawn_mower
271
+ Crumpling,_crinkling
272
+ Frying_(food)
273
+ Beep,_bleep
274
+ Sitar
275
+ Tire_squeal
276
+ Whack,_thwack
277
+ Sink_(filling_or_washing)
278
+ Skidding
279
+ Vibraphone
280
+ Rub
281
+ Waterfall
282
+ Crackle
283
+ Rustle
284
+ Smash,_crash
285
+ Hubbub,_speech_noise,_speech_babble
286
+ Happy_music
287
+ Electronic_tuner
288
+ Crying,_sobbing
289
+ Fire
290
+ Jingle_(music)
291
+ Fly,_housefly
292
+ Hammond_organ
293
+ Whoosh,_swoosh,_swish
294
+ Hum
295
+ Timpani
296
+ Shout
297
+ Clatter
298
+ Trickle,_dribble
299
+ Sizzle
300
+ Electronic_organ
301
+ Burping,_eructation
302
+ Firecracker
303
+ Throbbing
304
+ Telephone
305
+ Thunder
306
+ Echo
307
+ Blender
308
+ Thunderstorm
309
+ Frog
310
+ Whimper
311
+ Didgeridoo
312
+ Environmental_noise
313
+ Snake
314
+ Raindrop
315
+ Fart
316
+ French_horn
317
+ Slosh
318
+ Zither
319
+ Screaming
320
+ Jingle,_tinkle
321
+ Church_bell
322
+ String_section
323
+ Sneeze
324
+ Jingle_bell
325
+ Aircraft_engine
326
+ Funny_music
327
+ Angry_music
328
+ Wild_animals
329
+ Turkey
330
+ Engine_starting
331
+ Rattle
332
+ Arrow
333
+ Ringtone
334
+ Propeller,_airscrew
335
+ Sliding_door
336
+ Heart_sounds,_heartbeat
337
+ Pink_noise
338
+ Steelpan
339
+ Giggle
340
+ Pig
341
+ Buzzer
342
+ Artillery_fire
343
+ Splash,_splatter
344
+ Roar
345
+ Fire_alarm
346
+ Hiccup
347
+ Gobble
348
+ Air_brake
349
+ Chime
350
+ Plop
351
+ Singing_bowl
352
+ Cattle,_bovinae
353
+ Cutlery,_silverware
354
+ Slap,_smack
355
+ Cough
356
+ Ship
357
+ Reverberation
358
+ Babbling
359
+ Cacophony
360
+ Electric_shaver,_electric_razor
361
+ Baby_laughter
362
+ Cricket
363
+ Crow
364
+ Writing
365
+ Glass
366
+ Howl
367
+ Slam
368
+ Belly_laugh
369
+ Mechanical_fan
370
+ Breathing
371
+ Children_playing
372
+ Chewing,_mastication
373
+ Microwave_oven
374
+ Stir
375
+ Wedding_music
376
+ Ping
377
+ Sawing
378
+ Eruption
379
+ Steam_whistle
380
+ Yell
381
+ Clapping
382
+ Alarm
383
+ Chink,_clink
384
+ Growling
385
+ Roaring_cats_(lions,_tigers)
386
+ Drip
387
+ Air_horn,_truck_horn
388
+ Clicking
389
+ Toot
390
+ Ding
391
+ Harmonic
392
+ Jet_engine
393
+ Moo
394
+ Tambourine
395
+ Train_whistle
396
+ Clock
397
+ Children_shouting
398
+ Engine_knocking
399
+ Patter
400
+ Ratchet,_pawl
401
+ Filing_(rasp)
402
+ Groan
403
+ Telephone_bell_ringing
404
+ Change_ringing_(campanology)
405
+ Cap_gun
406
+ Whistle
407
+ Synthetic_singing
408
+ Rodents,_rats,_mice
409
+ Fill_(with_liquid)
410
+ Owl
411
+ Theremin
412
+ Typewriter
413
+ Gears
414
+ Pump_(liquid)
415
+ Caw
416
+ Maraca
417
+ Neigh,_whinny
418
+ Alarm_clock
419
+ Wind_chime
420
+ Shuffle
421
+ Smoke_detector,_smoke_alarm
422
+ Train_wheels_squealing
423
+ Purr
424
+ Static
425
+ Caterwaul
426
+ Snort
427
+ Bellow
428
+ Tubular_bells
429
+ Yodeling
430
+ Keys_jangling
431
+ Battle_cry
432
+ Hammer
433
+ Rumble
434
+ Gong
435
+ Single-lens_reflex_camera
436
+ Flap
437
+ Boing
438
+ Car_alarm
439
+ Ding-dong
440
+ Breaking
441
+ Sine_wave
442
+ Telephone_dialing,_DTMF
443
+ Whip
444
+ Boiling
445
+ Buzz
446
+ Mains_hum
447
+ Chop
448
+ Heart_murmur
449
+ Light_engine_(high_frequency)
450
+ Hands
451
+ Camera
452
+ Humming
453
+ Thunk
454
+ Rattle_(instrument)
455
+ Scrape
456
+ Gush
457
+ Mouse
458
+ Air_conditioning
459
+ Scratch
460
+ Pour
461
+ Coin_(dropping)
462
+ Hair_dryer
463
+ Chorus_effect
464
+ Croak
465
+ Squish
466
+ Dial_tone
467
+ Crunch
468
+ Tearing
469
+ Field_recording
470
+ Cash_register
471
+ Zipper_(clothing)
472
+ Bang
473
+ Shatter
474
+ Biting
475
+ Throat_clearing
476
+ Bird_flight,_flapping_wings
477
+ Noise
478
+ Mosquito
479
+ Reversing_beeps
480
+ Sonar
481
+ Grunt
482
+ Shofar
483
+ Doorbell
484
+ Ice_cream_truck,_ice_cream_van
485
+ Gasp
486
+ Shuffling_cards
487
+ Knock
488
+ Sigh
489
+ Scissors
490
+ Clang
491
+ Cowbell
492
+ Pant
493
+ Electric_toothbrush
494
+ Jackhammer
495
+ Busy_signal
496
+ Drawer_open_or_close
497
+ Whir
498
+ Power_windows,_electric_windows
499
+ Crack
500
+ Whale_vocalization
501
+ Zing
502
+ Stomach_rumble
503
+ Wail,_moan
504
+ Bouncing
505
+ Pulse
506
+ Foghorn
507
+ Bicycle_bell
508
+ Sniff
509
+ Chirp_tone
510
+ Squeak
511
+ Cupboard_open_or_close
512
+ Sanding
513
+ Sidetone
514
+ Wheeze
515
+ Squawk
516
+ Squeal
517
+ Splinter
518
+ Dental_drill,_dentist's_drill
519
+ Finger_snapping
520
+ Chopping_(food)
521
+ Tuning_fork
522
+ Gargling
523
+ Pulleys
524
+ Toothbrush
525
+ Creak
526
+ Crushing
527
+ Hoot
528
+ <blank>
529
+ <unk>
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2/RESULTS.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_cls_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Tue May 6 06:06:59 CDT 2025`
5
+ - python version: `3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 17:20:25) [GCC 12.3.0]`
6
+ - espnet version: `espnet 202412`
7
+ - pytorch version: `pytorch 2.6.0.dev20241210+cu124`
8
+ - Git hash: `c7e589a3608814d4a78ebe29147f10e31358795d`
9
+ - Commit date: `Wed Apr 23 17:56:52 2025 -0500`
10
+
11
+ ## cls_earlarge2.final
12
+ |Split|mean_acc|mAP|mean_auc|n_labels|n_instances|
13
+ |---|---|---|---|---|---|
14
+ |cls_eval|50.78|42.17|93.60|527.00|20123.00|
15
+
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2/config.yaml ADDED
@@ -0,0 +1,731 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earlarge2.final/conf/ear_large/audioset2m.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2.final
9
+ ngpu: 0
10
+ seed: 0
11
+ num_workers: 2
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: null
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: false
30
+ cudnn_deterministic: true
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 25
35
+ patience: null
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - valid
45
+ - epoch_mAP
46
+ - max
47
+ keep_nbest_models: 1
48
+ nbest_averaging_interval: 0
49
+ grad_clip: 1
50
+ grad_clip_type: 2.0
51
+ grad_noise: false
52
+ accum_grad: 2
53
+ no_forward_run: false
54
+ resume: true
55
+ train_dtype: float32
56
+ use_amp: false
57
+ log_interval: null
58
+ use_matplotlib: true
59
+ use_tensorboard: true
60
+ create_graph_in_tensorboard: false
61
+ use_wandb: true
62
+ wandb_project: audioverse
63
+ wandb_id: null
64
+ wandb_entity: shikhar
65
+ wandb_name: audioset2m.earlarge2.final
66
+ wandb_model_log_interval: -1
67
+ detect_anomaly: false
68
+ use_adapter: false
69
+ adapter: lora
70
+ save_strategy: all
71
+ adapter_conf: {}
72
+ pretrain_path: null
73
+ init_param: []
74
+ ignore_init_mismatch: false
75
+ freeze_param: []
76
+ num_iters_per_epoch: null
77
+ batch_size: 20
78
+ valid_batch_size: null
79
+ batch_bins: 20000000
80
+ valid_batch_bins: null
81
+ category_sample_size: 10
82
+ train_shape_file:
83
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/train/speech_shape
84
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/train/label_shape
85
+ valid_shape_file:
86
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/valid/speech_shape
87
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/valid/label_shape
88
+ batch_type: length_weighted
89
+ valid_batch_type: length
90
+ fold_length:
91
+ - 160000
92
+ - 600
93
+ sort_in_batch: descending
94
+ shuffle_within_batch: false
95
+ sort_batch: descending
96
+ multiple_iterator: false
97
+ utt2weight_file: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/audioset2m/utt2weight
98
+ chunk_length: 500
99
+ chunk_shift_ratio: 0.5
100
+ num_cache_chunks: 1024
101
+ chunk_excluded_key_prefixes: []
102
+ chunk_default_fs: null
103
+ chunk_max_abs_length: null
104
+ chunk_discard_short_samples: true
105
+ train_data_path_and_name_and_type:
106
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/train/wav.scp
107
+ - speech
108
+ - sound
109
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/train/text
110
+ - label
111
+ - text
112
+ valid_data_path_and_name_and_type:
113
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/eval/wav.scp
114
+ - speech
115
+ - sound
116
+ - - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/eval/text
117
+ - label
118
+ - text
119
+ multi_task_dataset: false
120
+ allow_variable_data_keys: false
121
+ max_cache_size: 0.0
122
+ max_cache_fd: 32
123
+ allow_multi_rates: false
124
+ valid_max_cache_size: null
125
+ exclude_weight_decay: false
126
+ exclude_weight_decay_conf: {}
127
+ optim: adamw
128
+ optim_conf:
129
+ lr: 0.0001
130
+ weight_decay: 0.01
131
+ betas:
132
+ - 0.9
133
+ - 0.98
134
+ scheduler: cosineannealingwarmuprestarts
135
+ scheduler_conf:
136
+ first_cycle_steps: 400000
137
+ warmup_steps: 10000
138
+ max_lr: 0.0001
139
+ min_lr: 5.0e-06
140
+ lightning_conf:
141
+ log_every_n_steps: 250
142
+ max_epochs: 25
143
+ strategy: ddp
144
+ strategy_conf:
145
+ find_unused_parameters: true
146
+ best_model_criterion:
147
+ - - valid/epoch_mAP
148
+ - max
149
+ - 1
150
+ devices: 4
151
+ num_nodes: 1
152
+ default_root_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2.final
153
+ token_list:
154
+ - Music
155
+ - Speech
156
+ - Vehicle
157
+ - Musical_instrument
158
+ - Inside,_small_room
159
+ - Guitar
160
+ - Plucked_string_instrument
161
+ - Singing
162
+ - Car
163
+ - Animal
164
+ - Electronic_music
165
+ - Outside,_rural_or_natural
166
+ - Outside,_urban_or_manmade
167
+ - Violin,_fiddle
168
+ - Inside,_large_room_or_hall
169
+ - Bird
170
+ - Drum
171
+ - Domestic_animals,_pets
172
+ - Dubstep
173
+ - Male_speech,_man_speaking
174
+ - Techno
175
+ - Percussion
176
+ - Engine
177
+ - Narration,_monologue
178
+ - Drum_kit
179
+ - Acoustic_guitar
180
+ - Strum
181
+ - Dog
182
+ - Boat,_Water_vehicle
183
+ - Train
184
+ - Electric_guitar
185
+ - Accelerating,_revving,_vroom
186
+ - Piano
187
+ - Child_speech,_kid_speaking
188
+ - Truck
189
+ - Keyboard_(musical)
190
+ - Crowd
191
+ - Bowed_string_instrument
192
+ - Bass_drum
193
+ - Rock_and_roll
194
+ - Motor_vehicle_(road)
195
+ - Pop_music
196
+ - Rail_transport
197
+ - Pigeon,_dove
198
+ - Water
199
+ - Female_speech,_woman_speaking
200
+ - Orchestra
201
+ - Rock_music
202
+ - Sound_effect
203
+ - Motorboat,_speedboat
204
+ - Railroad_car,_train_wagon
205
+ - Siren
206
+ - Tools
207
+ - Female_singing
208
+ - Hip_hop_music
209
+ - Silence
210
+ - Motorcycle
211
+ - Male_singing
212
+ - Brass_instrument
213
+ - Classical_music
214
+ - Snare_drum
215
+ - Inside,_public_space
216
+ - Choir
217
+ - Soundtrack_music
218
+ - House_music
219
+ - Wind
220
+ - Race_car,_auto_racing
221
+ - Heavy_metal
222
+ - Bass_guitar
223
+ - Chicken,_rooster
224
+ - Wind_noise_(microphone)
225
+ - Idling
226
+ - Medium_engine_(mid_frequency)
227
+ - Bird_vocalization,_bird_call,_bird_song
228
+ - Fowl
229
+ - Wind_instrument,_woodwind_instrument
230
+ - Exciting_music
231
+ - Country
232
+ - Laughter
233
+ - Marimba,_xylophone
234
+ - Sampler
235
+ - Emergency_vehicle
236
+ - Aircraft
237
+ - Electronica
238
+ - Ukulele
239
+ - Cello
240
+ - Bus
241
+ - Rhythm_and_blues
242
+ - Synthesizer
243
+ - Background_music
244
+ - Jazz
245
+ - Mantra
246
+ - Dance_music
247
+ - Flute
248
+ - Blues
249
+ - Effects_unit
250
+ - Electric_piano
251
+ - Cymbal
252
+ - Chirp,_tweet
253
+ - Rimshot
254
+ - Rapping
255
+ - Trance_music
256
+ - Livestock,_farm_animals,_working_animals
257
+ - Run
258
+ - Cheering
259
+ - Electronic_dance_music
260
+ - Theme_music
261
+ - Gospel_music
262
+ - Heavy_engine_(low_frequency)
263
+ - Radio
264
+ - Music_of_Latin_America
265
+ - Disco
266
+ - Glockenspiel
267
+ - Tender_music
268
+ - Punk_rock
269
+ - Funk
270
+ - Gunshot,_gunfire
271
+ - Hi-hat
272
+ - Bow-wow
273
+ - Cat
274
+ - Music_of_Asia
275
+ - Trumpet
276
+ - Car_passing_by
277
+ - Drum_and_bass
278
+ - Vehicle_horn,_car_horn,_honking
279
+ - Helicopter
280
+ - Vocal_music
281
+ - Police_car_(siren)
282
+ - Pizzicato
283
+ - Progressive_rock
284
+ - Tap
285
+ - Printer
286
+ - Video_game_music
287
+ - Music_for_children
288
+ - Clip-clop
289
+ - Ocean
290
+ - Drum_machine
291
+ - Rain
292
+ - Horse
293
+ - Power_tool
294
+ - Wood
295
+ - Lullaby
296
+ - Mallet_percussion
297
+ - Reggae
298
+ - Fire_engine,_fire_truck_(siren)
299
+ - New-age_music
300
+ - Christian_music
301
+ - Spray
302
+ - Saxophone
303
+ - Fireworks
304
+ - Skateboard
305
+ - Independent_music
306
+ - Fixed-wing_aircraft,_airplane
307
+ - Drum_roll
308
+ - Insect
309
+ - Bicycle
310
+ - Coo
311
+ - Tick-tock
312
+ - Accordion
313
+ - Scratching_(performance_technique)
314
+ - Soul_music
315
+ - Rain_on_surface
316
+ - Stream
317
+ - Rowboat,_canoe,_kayak
318
+ - Organ
319
+ - Ambient_music
320
+ - Steel_guitar,_slide_guitar
321
+ - Waves,_surf
322
+ - Distortion
323
+ - Music_of_Africa
324
+ - Trombone
325
+ - Bluegrass
326
+ - Sailboat,_sailing_ship
327
+ - Afrobeat
328
+ - Sheep
329
+ - Child_singing
330
+ - Bark
331
+ - Duck
332
+ - Salsa_music
333
+ - Quack
334
+ - Door
335
+ - Hiss
336
+ - Cluck
337
+ - Beatboxing
338
+ - Music_of_Bollywood
339
+ - Banjo
340
+ - Vibration
341
+ - Water_tap,_faucet
342
+ - Baby_cry,_infant_cry
343
+ - Mandolin
344
+ - Yip
345
+ - Flamenco
346
+ - Snoring
347
+ - Psychedelic_rock
348
+ - Opera
349
+ - Explosion
350
+ - Double_bass
351
+ - Burst,_pop
352
+ - Toilet_flush
353
+ - Conversation
354
+ - Applause
355
+ - Television
356
+ - Harmonica
357
+ - Train_horn
358
+ - Folk_music
359
+ - Wood_block
360
+ - Christmas_music
361
+ - Clarinet
362
+ - Steam
363
+ - Typing
364
+ - Crowing,_cock-a-doodle-doo
365
+ - Bleat
366
+ - Basketball_bounce
367
+ - Middle_Eastern_music
368
+ - Goat
369
+ - Roll
370
+ - Harp
371
+ - Harpsichord
372
+ - Computer_keyboard
373
+ - Grunge
374
+ - Vacuum_cleaner
375
+ - Whistling
376
+ - Drill
377
+ - Clickety-clack
378
+ - Meow
379
+ - Chatter
380
+ - Gurgling
381
+ - A_capella
382
+ - Civil_defense_siren
383
+ - Whoop
384
+ - Sewing_machine
385
+ - Bee,_wasp,_etc.
386
+ - Ambulance_(siren)
387
+ - Machine_gun
388
+ - Bell
389
+ - Honk
390
+ - Snicker
391
+ - Whimper_(dog)
392
+ - Dishes,_pots,_and_pans
393
+ - Tapping_(guitar_technique)
394
+ - White_noise
395
+ - Sad_music
396
+ - Thump,_thud
397
+ - Chainsaw
398
+ - Goose
399
+ - Bagpipes
400
+ - Oink
401
+ - Tick
402
+ - Song
403
+ - Traditional_music
404
+ - Tabla
405
+ - Scary_music
406
+ - Subway,_metro,_underground
407
+ - Rustling_leaves
408
+ - Liquid
409
+ - Ska
410
+ - Chant
411
+ - Speech_synthesizer
412
+ - Canidae,_dogs,_wolves
413
+ - Walk,_footsteps
414
+ - Mechanisms
415
+ - Chuckle,_chortle
416
+ - Traffic_noise,_roadway_noise
417
+ - Bathtub_(filling_or_washing)
418
+ - Boom
419
+ - Carnatic_music
420
+ - Fusillade
421
+ - Swing_music
422
+ - Whispering
423
+ - Lawn_mower
424
+ - Crumpling,_crinkling
425
+ - Frying_(food)
426
+ - Beep,_bleep
427
+ - Sitar
428
+ - Tire_squeal
429
+ - Whack,_thwack
430
+ - Sink_(filling_or_washing)
431
+ - Skidding
432
+ - Vibraphone
433
+ - Rub
434
+ - Waterfall
435
+ - Crackle
436
+ - Rustle
437
+ - Smash,_crash
438
+ - Hubbub,_speech_noise,_speech_babble
439
+ - Happy_music
440
+ - Electronic_tuner
441
+ - Crying,_sobbing
442
+ - Fire
443
+ - Jingle_(music)
444
+ - Fly,_housefly
445
+ - Hammond_organ
446
+ - Whoosh,_swoosh,_swish
447
+ - Hum
448
+ - Timpani
449
+ - Shout
450
+ - Clatter
451
+ - Trickle,_dribble
452
+ - Sizzle
453
+ - Electronic_organ
454
+ - Burping,_eructation
455
+ - Firecracker
456
+ - Throbbing
457
+ - Telephone
458
+ - Thunder
459
+ - Echo
460
+ - Blender
461
+ - Thunderstorm
462
+ - Frog
463
+ - Whimper
464
+ - Didgeridoo
465
+ - Environmental_noise
466
+ - Snake
467
+ - Raindrop
468
+ - Fart
469
+ - French_horn
470
+ - Slosh
471
+ - Zither
472
+ - Screaming
473
+ - Jingle,_tinkle
474
+ - Church_bell
475
+ - String_section
476
+ - Sneeze
477
+ - Jingle_bell
478
+ - Aircraft_engine
479
+ - Funny_music
480
+ - Angry_music
481
+ - Wild_animals
482
+ - Turkey
483
+ - Engine_starting
484
+ - Rattle
485
+ - Arrow
486
+ - Ringtone
487
+ - Propeller,_airscrew
488
+ - Sliding_door
489
+ - Heart_sounds,_heartbeat
490
+ - Pink_noise
491
+ - Steelpan
492
+ - Giggle
493
+ - Pig
494
+ - Buzzer
495
+ - Artillery_fire
496
+ - Splash,_splatter
497
+ - Roar
498
+ - Fire_alarm
499
+ - Hiccup
500
+ - Gobble
501
+ - Air_brake
502
+ - Chime
503
+ - Plop
504
+ - Singing_bowl
505
+ - Cattle,_bovinae
506
+ - Cutlery,_silverware
507
+ - Slap,_smack
508
+ - Cough
509
+ - Ship
510
+ - Reverberation
511
+ - Babbling
512
+ - Cacophony
513
+ - Electric_shaver,_electric_razor
514
+ - Baby_laughter
515
+ - Cricket
516
+ - Crow
517
+ - Writing
518
+ - Glass
519
+ - Howl
520
+ - Slam
521
+ - Belly_laugh
522
+ - Mechanical_fan
523
+ - Breathing
524
+ - Children_playing
525
+ - Chewing,_mastication
526
+ - Microwave_oven
527
+ - Stir
528
+ - Wedding_music
529
+ - Ping
530
+ - Sawing
531
+ - Eruption
532
+ - Steam_whistle
533
+ - Yell
534
+ - Clapping
535
+ - Alarm
536
+ - Chink,_clink
537
+ - Growling
538
+ - Roaring_cats_(lions,_tigers)
539
+ - Drip
540
+ - Air_horn,_truck_horn
541
+ - Clicking
542
+ - Toot
543
+ - Ding
544
+ - Harmonic
545
+ - Jet_engine
546
+ - Moo
547
+ - Tambourine
548
+ - Train_whistle
549
+ - Clock
550
+ - Children_shouting
551
+ - Engine_knocking
552
+ - Patter
553
+ - Ratchet,_pawl
554
+ - Filing_(rasp)
555
+ - Groan
556
+ - Telephone_bell_ringing
557
+ - Change_ringing_(campanology)
558
+ - Cap_gun
559
+ - Whistle
560
+ - Synthetic_singing
561
+ - Rodents,_rats,_mice
562
+ - Fill_(with_liquid)
563
+ - Owl
564
+ - Theremin
565
+ - Typewriter
566
+ - Gears
567
+ - Pump_(liquid)
568
+ - Caw
569
+ - Maraca
570
+ - Neigh,_whinny
571
+ - Alarm_clock
572
+ - Wind_chime
573
+ - Shuffle
574
+ - Smoke_detector,_smoke_alarm
575
+ - Train_wheels_squealing
576
+ - Purr
577
+ - Static
578
+ - Caterwaul
579
+ - Snort
580
+ - Bellow
581
+ - Tubular_bells
582
+ - Yodeling
583
+ - Keys_jangling
584
+ - Battle_cry
585
+ - Hammer
586
+ - Rumble
587
+ - Gong
588
+ - Single-lens_reflex_camera
589
+ - Flap
590
+ - Boing
591
+ - Car_alarm
592
+ - Ding-dong
593
+ - Breaking
594
+ - Sine_wave
595
+ - Telephone_dialing,_DTMF
596
+ - Whip
597
+ - Boiling
598
+ - Buzz
599
+ - Mains_hum
600
+ - Chop
601
+ - Heart_murmur
602
+ - Light_engine_(high_frequency)
603
+ - Hands
604
+ - Camera
605
+ - Humming
606
+ - Thunk
607
+ - Rattle_(instrument)
608
+ - Scrape
609
+ - Gush
610
+ - Mouse
611
+ - Air_conditioning
612
+ - Scratch
613
+ - Pour
614
+ - Coin_(dropping)
615
+ - Hair_dryer
616
+ - Chorus_effect
617
+ - Croak
618
+ - Squish
619
+ - Dial_tone
620
+ - Crunch
621
+ - Tearing
622
+ - Field_recording
623
+ - Cash_register
624
+ - Zipper_(clothing)
625
+ - Bang
626
+ - Shatter
627
+ - Biting
628
+ - Throat_clearing
629
+ - Bird_flight,_flapping_wings
630
+ - Noise
631
+ - Mosquito
632
+ - Reversing_beeps
633
+ - Sonar
634
+ - Grunt
635
+ - Shofar
636
+ - Doorbell
637
+ - Ice_cream_truck,_ice_cream_van
638
+ - Gasp
639
+ - Shuffling_cards
640
+ - Knock
641
+ - Sigh
642
+ - Scissors
643
+ - Clang
644
+ - Cowbell
645
+ - Pant
646
+ - Electric_toothbrush
647
+ - Jackhammer
648
+ - Busy_signal
649
+ - Drawer_open_or_close
650
+ - Whir
651
+ - Power_windows,_electric_windows
652
+ - Crack
653
+ - Whale_vocalization
654
+ - Zing
655
+ - Stomach_rumble
656
+ - Wail,_moan
657
+ - Bouncing
658
+ - Pulse
659
+ - Foghorn
660
+ - Bicycle_bell
661
+ - Sniff
662
+ - Chirp_tone
663
+ - Squeak
664
+ - Cupboard_open_or_close
665
+ - Sanding
666
+ - Sidetone
667
+ - Wheeze
668
+ - Squawk
669
+ - Squeal
670
+ - Splinter
671
+ - Dental_drill,_dentist's_drill
672
+ - Finger_snapping
673
+ - Chopping_(food)
674
+ - Tuning_fork
675
+ - Gargling
676
+ - Pulleys
677
+ - Toothbrush
678
+ - Creak
679
+ - Crushing
680
+ - Hoot
681
+ - <blank>
682
+ - <unk>
683
+ text_token_list: null
684
+ text_bpemodel: null
685
+ init: xavier_normal
686
+ input_size: 1
687
+ use_preprocessor: true
688
+ frontend: null
689
+ frontend_conf: {}
690
+ specaug: null
691
+ specaug_conf: {}
692
+ normalize: null
693
+ normalize_conf: {}
694
+ preencoder: null
695
+ preencoder_conf: {}
696
+ encoder: beats
697
+ encoder_conf:
698
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/7Msounds/exp/beats_iter1_large1.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch_latest.pt
699
+ beats_config:
700
+ layer_wise_gradient_decay_ratio: 0.6
701
+ encoder_layerdrop: 0.1
702
+ dropout: 0.0
703
+ use_weighted_representation: false
704
+ specaug_config:
705
+ apply_time_warp: true
706
+ apply_freq_mask: false
707
+ apply_time_mask: true
708
+ time_mask_width_ratio_range:
709
+ - 0
710
+ - 0.06
711
+ num_time_mask: 1
712
+ roll_augment: true
713
+ roll_interval: 1
714
+ text_encoder: null
715
+ text_encoder_conf: {}
716
+ embedding_fusion: null
717
+ embedding_fusion_conf: {}
718
+ decoder: linear
719
+ decoder_conf: {}
720
+ model: espnet
721
+ model_conf:
722
+ classification_type: multi-label
723
+ mixup_probability: 0.8
724
+ lsm_weight: 0.1
725
+ log_epoch_metrics: true
726
+ user_callbacks:
727
+ - mAP_logging
728
+ required:
729
+ - output_dir
730
+ - token_list
731
+ task: cls
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2/lightning_logs/version_0/events.out.tfevents.1746445644.gh097.hsn.cm.delta.internal.ncsa.edu.1910608.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb620351b279fb5f4d500be02c711bbe7452e59b7cd841a112a91d2025c31367
3
+ size 289004
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2/lightning_logs/version_0/hparams.yaml ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ args: !!python/object:argparse.Namespace
2
+ accum_grad: 2
3
+ adapter: lora
4
+ adapter_conf: {}
5
+ allow_multi_rates: false
6
+ allow_variable_data_keys: false
7
+ batch_bins: 20000000
8
+ batch_size: 20
9
+ batch_type: length_weighted
10
+ best_model_criterion:
11
+ - - valid
12
+ - epoch_mAP
13
+ - max
14
+ category_sample_size: 10
15
+ chunk_default_fs: null
16
+ chunk_discard_short_samples: true
17
+ chunk_excluded_key_prefixes: []
18
+ chunk_length: 500
19
+ chunk_max_abs_length: null
20
+ chunk_shift_ratio: 0.5
21
+ collect_stats: false
22
+ config: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/earlarge2.final/conf/ear_large/audioset2m.yaml
23
+ create_graph_in_tensorboard: false
24
+ cudnn_benchmark: false
25
+ cudnn_deterministic: true
26
+ cudnn_enabled: true
27
+ ddp_comm_hook: null
28
+ decoder: linear
29
+ decoder_conf: {}
30
+ deepspeed_config: null
31
+ detect_anomaly: false
32
+ dist_backend: nccl
33
+ dist_init_method: env://
34
+ dist_launcher: null
35
+ dist_master_addr: null
36
+ dist_master_port: null
37
+ dist_rank: null
38
+ dist_world_size: null
39
+ drop_last_iter: false
40
+ dry_run: false
41
+ early_stopping_criterion: !!python/tuple
42
+ - valid
43
+ - loss
44
+ - min
45
+ embedding_fusion: null
46
+ embedding_fusion_conf: {}
47
+ encoder: beats
48
+ encoder_conf:
49
+ beats_ckpt_path: /work/nvme/bbjs/sbharadwaj/7Msounds/exp/beats_iter1_large1.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch_latest.pt
50
+ beats_config:
51
+ dropout: 0.0
52
+ encoder_layerdrop: 0.1
53
+ layer_wise_gradient_decay_ratio: 0.6
54
+ roll_augment: true
55
+ roll_interval: 1
56
+ specaug_config:
57
+ apply_freq_mask: false
58
+ apply_time_mask: true
59
+ apply_time_warp: true
60
+ num_time_mask: 1
61
+ time_mask_width_ratio_range:
62
+ - 0
63
+ - 0.06
64
+ use_weighted_representation: false
65
+ exclude_weight_decay: false
66
+ exclude_weight_decay_conf: {}
67
+ fold_length:
68
+ - 160000
69
+ - 600
70
+ freeze_param: []
71
+ frontend: null
72
+ frontend_conf:
73
+ fs: 16k
74
+ grad_clip: 1
75
+ grad_clip_type: 2.0
76
+ grad_noise: false
77
+ gradient_as_bucket_view: true
78
+ ignore_init_mismatch: false
79
+ init: xavier_normal
80
+ init_param: []
81
+ input_size: 1
82
+ iterator_type: sequence
83
+ keep_nbest_models: 1
84
+ lightning_conf:
85
+ best_model_criterion:
86
+ - - valid/epoch_mAP
87
+ - max
88
+ - 1
89
+ default_root_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2.final
90
+ devices: 4
91
+ log_every_n_steps: 250
92
+ max_epochs: 25
93
+ num_nodes: 1
94
+ strategy: ddp
95
+ strategy_conf:
96
+ find_unused_parameters: true
97
+ local_rank: null
98
+ log_interval: null
99
+ log_level: INFO
100
+ max_cache_fd: 32
101
+ max_cache_size: 0.0
102
+ max_epoch: 25
103
+ model: espnet
104
+ model_conf:
105
+ classification_type: multi-label
106
+ log_epoch_metrics: true
107
+ lsm_weight: 0.1
108
+ mixup_probability: 0.8
109
+ multi_task_dataset: false
110
+ multiple_iterator: false
111
+ multiprocessing_distributed: false
112
+ nbest_averaging_interval: 0
113
+ ngpu: 0
114
+ no_forward_run: false
115
+ normalize: null
116
+ normalize_conf: {}
117
+ num_att_plot: 0
118
+ num_cache_chunks: 1024
119
+ num_iters_per_epoch: null
120
+ num_workers: 2
121
+ optim: adamw
122
+ optim_conf:
123
+ betas:
124
+ - 0.9
125
+ - 0.98
126
+ lr: 0.0001
127
+ weight_decay: 0.01
128
+ output_dir: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2.final
129
+ patience: null
130
+ preencoder: null
131
+ preencoder_conf: {}
132
+ pretrain_path: null
133
+ print_config: false
134
+ required:
135
+ - output_dir
136
+ - token_list
137
+ resume: true
138
+ save_strategy: all
139
+ scheduler: cosineannealingwarmuprestarts
140
+ scheduler_conf:
141
+ first_cycle_steps: 400000
142
+ max_lr: 0.0001
143
+ min_lr: 5.0e-06
144
+ warmup_steps: 10000
145
+ seed: 0
146
+ sharded_ddp: false
147
+ shuffle_within_batch: false
148
+ sort_batch: descending
149
+ sort_in_batch: descending
150
+ specaug: null
151
+ specaug_conf: {}
152
+ task: cls
153
+ text_bpemodel: null
154
+ text_encoder: null
155
+ text_encoder_conf: {}
156
+ text_token_list: null
157
+ token_list: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/audioset2m/token_list
158
+ train_data_path_and_name_and_type:
159
+ - !!python/tuple
160
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/train/wav.scp
161
+ - speech
162
+ - sound
163
+ - !!python/tuple
164
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/train/text
165
+ - label
166
+ - text
167
+ train_dtype: float32
168
+ train_shape_file:
169
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/train/speech_shape
170
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/train/label_shape
171
+ unused_parameters: true
172
+ use_adapter: false
173
+ use_amp: false
174
+ use_deepspeed: false
175
+ use_matplotlib: true
176
+ use_preprocessor: true
177
+ use_tensorboard: true
178
+ use_tf32: false
179
+ use_wandb: true
180
+ user_callbacks:
181
+ - mAP_logging
182
+ utt2weight_file: /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/data/audioset2m/utt2weight
183
+ val_scheduler_criterion: !!python/tuple
184
+ - valid
185
+ - loss
186
+ valid_batch_bins: null
187
+ valid_batch_size: null
188
+ valid_batch_type: length
189
+ valid_data_path_and_name_and_type:
190
+ - !!python/tuple
191
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/eval/wav.scp
192
+ - speech
193
+ - sound
194
+ - !!python/tuple
195
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/dump/audioset2m/eval/text
196
+ - label
197
+ - text
198
+ valid_iterator_type: null
199
+ valid_max_cache_size: null
200
+ valid_shape_file:
201
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/valid/speech_shape
202
+ - /work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_stats_16k/valid/label_shape
203
+ wandb_entity: shikhar
204
+ wandb_id: null
205
+ wandb_model_log_interval: -1
206
+ wandb_name: audioset2m.earlarge2.final
207
+ wandb_project: audioverse
208
+ write_collected_feats: false
work/nvme/bbjs/sbharadwaj/espnet/egs2/audioverse/v1/exp/audioset2m/cls_earlarge2/valid.epoch_mAP.ave_1best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed8c67f57701425d69d3e5622544dd65b66addf6cb195a53a638df79f6291dcc
3
+ size 1247832682