{"id":"https://openalex.org/W6910842274","doi":"https://doi.org/10.48550/arxiv.2507.08236","title":"Distilling Spectrograms into Tokens: Fast and Lightweight Bioacoustic Classification for BirdCLEF+ 2025","display_name":"Distilling Spectrograms into Tokens: Fast and Lightweight Bioacoustic Classification for BirdCLEF+ 2025","publication_year":2025,"publication_date":"2025-07-11","ids":{"openalex":"https://openalex.org/W6910842274","doi":"https://doi.org/10.48550/arxiv.2507.08236"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2507.08236","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.08236","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2507.08236","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Miyaguchi, Anthony","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Miyaguchi, Anthony","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gustineli, Murilo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gustineli, Murilo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Cheung, Adrian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheung, Adrian","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11665","display_name":"Animal Vocal Communication and Behavior","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1309","display_name":"Developmental Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11665","display_name":"Animal Vocal Communication and Behavior","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1309","display_name":"Developmental Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.000699999975040555,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10332","display_name":"Amphibian and Reptile Biology","score":9.999999747378752e-05,"subfield":{"id":"https://openalex.org/subfields/2306","display_name":"Global and Planetary Change"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.8184000253677368},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6180999875068665},{"id":"https://openalex.org/keywords/bioacoustics","display_name":"Bioacoustics","score":0.6179999709129333},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5996999740600586},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.39649999141693115},{"id":"https://openalex.org/keywords/fuzz-testing","display_name":"Fuzz testing","score":0.39089998602867126},{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.37610000371932983},{"id":"https://openalex.org/keywords/word2vec","display_name":"Word2vec","score":0.3734999895095825}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.8184000253677368},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7008000016212463},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6180999875068665},{"id":"https://openalex.org/C34951282","wikidata":"https://www.wikidata.org/wiki/Q864191","display_name":"Bioacoustics","level":2,"score":0.6179999709129333},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5996999740600586},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5525000095367432},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.42250001430511475},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.39649999141693115},{"id":"https://openalex.org/C111065885","wikidata":"https://www.wikidata.org/wiki/Q1189053","display_name":"Fuzz testing","level":3,"score":0.39089998602867126},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.37610000371932983},{"id":"https://openalex.org/C2776461190","wikidata":"https://www.wikidata.org/wiki/Q22673982","display_name":"Word2vec","level":3,"score":0.3734999895095825},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3723999857902527},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3544999957084656},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.33649998903274536},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C8880873","wikidata":"https://www.wikidata.org/wiki/Q187787","display_name":"Genetic algorithm","level":2,"score":0.3066999912261963},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.2921000123023987},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2892000079154968},{"id":"https://openalex.org/C139502532","wikidata":"https://www.wikidata.org/wiki/Q1122090","display_name":"Computational intelligence","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2727000117301941}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2507.08236","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.08236","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2507.08236","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2507.08236","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,99],"BirdCLEF+":[1],"2025":[2],"challenge":[3],"requires":[4],"classifying":[5],"206":[6],"species,":[7],"including":[8],"birds,":[9],"mammals,":[10],"insects,":[11],"and":[12,80,93,113,153,180,210],"amphibians,":[13],"from":[14,51,102],"soundscape":[15],"recordings":[16],"under":[17],"a":[18,63,82,108,114,122,135,167,175,183,187,195,203,211],"strict":[19],"90-minute":[20],"CPU-only":[21],"inference":[22,66,189],"deadline,":[23],"making":[24],"many":[25],"state-of-the-art":[26],"deep":[27],"learning":[28],"approaches":[29,222],"impractical.":[30],"To":[31],"address":[32],"this":[33,232],"constraint,":[34],"the":[35,52,69,89,96,103,199,217],"DS@GT":[36],"BirdCLEF":[37],"team":[38],"explored":[39],"two":[40],"strategies.":[41],"First,":[42],"we":[43,61,120],"establish":[44],"competitive":[45],"baselines":[46],"by":[47,147],"optimizing":[48],"pre-trained":[49],"models":[50],"Bioacoustics":[53],"Model":[54],"Zoo":[55],"for":[56,68,159,194,226,231],"CPU":[57],"inference.":[58],"Using":[59],"TFLite,":[60],"achieved":[62,202],"nearly":[64],"10x":[65],"speedup":[67],"Perch":[70],"model,":[71],"enabling":[72],"it":[73],"to":[74,182],"run":[75],"in":[76,162],"approximately":[77],"16":[78],"minutes":[79,193],"achieve":[81],"final":[83,204],"ROC-AUC":[84,205],"score":[85,110,116,207,213],"of":[86,111,117,191,208,214,219],"0.729":[87],"on":[88,95],"public":[90,109,206],"leaderboard":[91],"post-competition":[92],"0.711":[94],"private":[97,115,212],"leaderboard.":[98],"best":[100],"model":[101],"zoo":[104],"was":[105],"BirdSetEfficientNetB1,":[106],"with":[107,166,223],"0.810":[112],"0.778.":[118],"Second,":[119],"introduce":[121],"novel,":[123],"lightweight":[124],"pipeline":[125],"named":[126],"Spectrogram":[127],"Token":[128],"Skip-Gram":[129],"(STSG)":[130],"that":[131],"treats":[132],"bioacoustics":[133],"as":[134],"sequence":[136],"modeling":[137],"task.":[138],"This":[139],"method":[140],"converts":[141],"audio":[142],"into":[143],"discrete":[144],"\"spectrogram":[145],"tokens\"":[146],"clustering":[148],"Mel-spectrograms":[149],"using":[150],"Faiss":[151],"K-means":[152],"then":[154],"learns":[155],"high-quality":[156],"contextual":[157],"embeddings":[158,173,225],"these":[160],"tokens":[161],"an":[163],"unsupervised":[164],"manner":[165],"Word2Vec":[168],"skip-gram":[169],"model.":[170,185],"For":[171],"classification,":[172],"within":[174],"5-second":[176],"window":[177],"are":[178],"averaged":[179],"passed":[181],"linear":[184],"With":[186],"projected":[188],"time":[190],"6":[192],"700-minute":[196],"test":[197],"set,":[198],"STSG":[200],"approach":[201],"0.559":[209],"0.520,":[215],"demonstrating":[216],"viability":[218],"fast":[220],"tokenization":[221],"static":[224],"bioacoustic":[227],"classification.":[228],"Supporting":[229],"code":[230],"paper":[233],"can":[234],"be":[235],"found":[236],"at":[237],"https://github.com/dsgt-arc/birdclef-2025.":[238]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
