{"id":"https://openalex.org/W4416251029","doi":"https://doi.org/10.1109/waspaa66052.2025.11230965","title":"OpenBEATs: A Fully Open-Source General-Purpose Audio Encoder","display_name":"OpenBEATs: A Fully Open-Source General-Purpose Audio Encoder","publication_year":2025,"publication_date":"2025-10-12","ids":{"openalex":"https://openalex.org/W4416251029","doi":"https://doi.org/10.1109/waspaa66052.2025.11230965"},"language":null,"primary_location":{"id":"doi:10.1109/waspaa66052.2025.11230965","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230965","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067162151","display_name":"Shikhar Bharadwaj","orcid":"https://orcid.org/0009-0003-7202-0502"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Shikhar Bharadwaj","raw_affiliation_strings":["Carnegie Mellon University,USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University,USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047682990","display_name":"Samuele Cornell","orcid":"https://orcid.org/0000-0002-5358-1844"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Samuele Cornell","raw_affiliation_strings":["Carnegie Mellon University,USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University,USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023799668","display_name":"Kwanghee Choi","orcid":"https://orcid.org/0000-0001-5254-1093"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kwanghee Choi","raw_affiliation_strings":["Carnegie Mellon University,USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University,USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003098968","display_name":"Satoru Fukayama","orcid":"https://orcid.org/0000-0001-6506-2796"},"institutions":[{"id":"https://openalex.org/I73613424","display_name":"National Institute of Advanced Industrial Science and Technology","ror":"https://ror.org/01703db54","country_code":"JP","type":"government","lineage":["https://openalex.org/I73613424"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Satoru Fukayama","raw_affiliation_strings":["National Institute of Advanced Industrial Science and Technology (AIST),Japan"],"affiliations":[{"raw_affiliation_string":"National Institute of Advanced Industrial Science and Technology (AIST),Japan","institution_ids":["https://openalex.org/I73613424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080292725","display_name":"Hye-jin Shim","orcid":"https://orcid.org/0000-0002-0912-6983"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hye-Jin Shim","raw_affiliation_strings":["Carnegie Mellon University,USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University,USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017946811","display_name":"Soham Deshmukh","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Soham Deshmukh","raw_affiliation_strings":["Carnegie Mellon University,USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University,USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001291873","display_name":"Shinji Watanabe","orcid":"https://orcid.org/0000-0002-5970-8631"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Carnegie Mellon University,USA"],"affiliations":[{"raw_affiliation_string":"Carnegie Mellon University,USA","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5067162151"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.45357686,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.29499998688697815,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.29499998688697815,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.1808999925851822,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.13420000672340393,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6712999939918518},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5992000102996826},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.576200008392334},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.538100004196167},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.44699999690055847},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.4138999879360199},{"id":"https://openalex.org/keywords/audio-equipment","display_name":"Audio equipment","score":0.3774999976158142},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.37299999594688416}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7524999976158142},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6712999939918518},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6116999983787537},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5992000102996826},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.576200008392334},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.538100004196167},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.44699999690055847},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.4138999879360199},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3896999955177307},{"id":"https://openalex.org/C2778488704","wikidata":"https://www.wikidata.org/wiki/Q15190726","display_name":"Audio equipment","level":2,"score":0.3774999976158142},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.37299999594688416},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3619999885559082},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.35199999809265137},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.3456000089645386},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3003999888896942},{"id":"https://openalex.org/C34951282","wikidata":"https://www.wikidata.org/wiki/Q864191","display_name":"Bioacoustics","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C9940772","wikidata":"https://www.wikidata.org/wiki/Q557399","display_name":"Psychoacoustics","level":3,"score":0.27709999680519104},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.2700999975204468},{"id":"https://openalex.org/C167310288","wikidata":"https://www.wikidata.org/wiki/Q7564808","display_name":"Sound quality","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/waspaa66052.2025.11230965","is_oa":false,"landing_page_url":"https://doi.org/10.1109/waspaa66052.2025.11230965","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2038484192","https://openalex.org/W2052666245","https://openalex.org/W2593116425","https://openalex.org/W2797977484","https://openalex.org/W2959539607","https://openalex.org/W2962780374","https://openalex.org/W2973049979","https://openalex.org/W3006926732","https://openalex.org/W3015591594","https://openalex.org/W3021013305","https://openalex.org/W3160599263","https://openalex.org/W3206996142","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4205689591","https://openalex.org/W4297841853","https://openalex.org/W4312097006","https://openalex.org/W4313014461","https://openalex.org/W4372260310","https://openalex.org/W4372262734","https://openalex.org/W4372266830","https://openalex.org/W4391021675","https://openalex.org/W4392903475","https://openalex.org/W4400033239","https://openalex.org/W4401609151","https://openalex.org/W4402112407","https://openalex.org/W4404782746","https://openalex.org/W4404784428","https://openalex.org/W4408352573","https://openalex.org/W4409362588","https://openalex.org/W4411119792"],"related_works":[],"abstract_inverted_index":{"Masked":[0],"token":[1,153],"prediction":[2,154],"has":[3,45],"emerged":[4],"as":[5,107],"a":[6,24,135],"powerful":[7],"pre-training":[8,26,55,170],"objective":[9],"across":[10,90],"language,":[11],"vision,":[12],"and":[13,98,112,126,151,165,171,175,178],"speech,":[14],"offering":[15],"the":[16,40,51,146],"potential":[17],"to":[18,50,156],"unify":[19],"these":[20,71],"diverse":[21],"modalities":[22],"through":[23],"single":[25],"task.":[27],"However,":[28],"its":[29,65],"application":[30],"for":[31],"general":[32],"audio":[33,84,100,103,108,159],"understanding":[34],"remains":[35],"underexplored,":[36],"with":[37],"BEATs":[38,44,58,81],"being":[39],"only":[41,61],"notable":[42],"example.":[43],"seen":[46],"limited":[47],"modifications":[48],"due":[49],"absence":[52],"of":[53,93,148],"open-source":[54,77],"code.":[56],"Furthermore,":[57],"was":[59],"trained":[60],"on":[62,118],"AudioSet,":[63],"restricting":[64],"broader":[66],"downstream":[67],"applicability.":[68],"To":[69,161],"address":[70],"gaps,":[72],"we":[73,167],"present":[74],"OpenBEATs,":[75],"an":[76],"framework":[78],"that":[79],"extends":[80],"via":[82],"multi-domain":[83,149],"pre-training.":[85],"We":[86],"conduct":[87],"comprehensive":[88],"evaluations":[89],"six":[91,119],"types":[92],"tasks,":[94],"twenty":[95],"five":[96,127],"datasets,":[97,121,129],"three":[99],"domains,":[101],"including":[102],"reasoning":[104,128],"tasks":[105],"such":[106],"question":[109],"answering,":[110],"entailment,":[111],"captioning.":[113],"OpenBEATs":[114],"achieves":[115],"state-of-the-art":[116],"performance":[117],"bioacoustics":[120],"two":[122],"environmental":[123],"sound":[124],"datasets":[125,150],"performing":[130],"better":[131],"than":[132],"models":[133],"exceeding":[134],"billion":[136],"parameters":[137],"at":[138],"one-fourth":[139],"their":[140],"parameter":[141],"size.":[142],"These":[143],"results":[144],"demonstrate":[145],"effectiveness":[147],"masked":[152],"task":[155],"learn":[157],"general-purpose":[158],"representations.":[160],"promote":[162],"further":[163],"research":[164],"reproducibility,":[166],"release":[168],"all":[169],"evaluation":[172],"code,":[173],"pretrained":[174],"fine-tuned":[176],"checkpoints,":[177],"training":[179],"logs":[180],"<sup":[181],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[182],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>.":[183]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
