{"id":"https://openalex.org/W7133210090","doi":"https://doi.org/10.48550/arxiv.2602.23649","title":"AudioCapBench: Quick Evaluation on Audio Captioning across Sound, Music, and Speech","display_name":"AudioCapBench: Quick Evaluation on Audio Captioning across Sound, Music, and Speech","publication_year":2026,"publication_date":"2026-02-27","ids":{"openalex":"https://openalex.org/W7133210090","doi":"https://doi.org/10.48550/arxiv.2602.23649"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.23649","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126703107","display_name":"Jielin Qiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Qiu, Jielin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112613156","display_name":"Jianguo Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jianguo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127870978","display_name":"Zixiang Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zixiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127811952","display_name":"Liangwei Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Liangwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127852168","display_name":"Ming Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Ming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125376603","display_name":"Juntao Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Juntao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101905560","display_name":"Haolin Chen","orcid":"https://orcid.org/0000-0002-9639-4846"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Haolin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127811139","display_name":"Wenting Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Wenting","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112976037","display_name":"Rithesh Murthy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Murthy, Rithesh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127801165","display_name":"Roshan Ram","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ram, Roshan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075651511","display_name":"Akshara Prabhakar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Prabhakar, Akshara","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063103006","display_name":"Shelby Heinecke","orcid":"https://orcid.org/0000-0002-8831-0753"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heinecke, Shelby","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127846847","display_name":"Caiming","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Caiming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127851731","display_name":"Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127809820","display_name":"Silvio Savarese","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Savarese, Silvio","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127859486","display_name":"Huan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Huan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":16,"corresponding_author_ids":["https://openalex.org/A5126703107"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.6388000249862671,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.6388000249862671,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.04769999906420708,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.03350000083446503,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.9718000292778015},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8126000165939331},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.438400000333786},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.29010000824928284}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.9718000292778015},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8126000165939331},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7633000016212463},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5139999985694885},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4715999960899353},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.438400000333786},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3871000111103058},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2637999951839447},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.23929999768733978}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.23649","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.23649","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.23649","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.23649","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,35,118],"introduce":[1],"AudioCapBench,":[2],"a":[3],"benchmark":[4,121],"for":[5],"evaluating":[6],"audio":[7,18,130],"captioning":[8,89,112],"capabilities":[9],"of":[10,68,74],"large":[11],"multimodal":[12],"models.":[13],"\\method":[14],"covers":[15],"three":[16,60],"distinct":[17],"domains,":[19],"including":[20],"environmental":[21],"sound,":[22],"music,":[23],"and":[24,52,71,113],"speech,":[25],"with":[26,91],"1,000":[27],"curated":[28],"evaluation":[29,125],"samples":[30],"drawn":[31],"from":[32],"established":[33],"datasets.":[34],"evaluate":[36],"13":[37],"models":[38,82,86,101,107],"across":[39],"two":[40],"providers":[41],"(OpenAI,":[42],"Google":[43],"Gemini)":[44],"using":[45],"both":[46],"reference-based":[47],"metrics":[48],"(METEOR,":[49],"BLEU,":[50],"ROUGE-L)":[51],"an":[53],"LLM-as-Judge":[54],"framework":[55],"that":[56,80],"scores":[57],"predictions":[58],"on":[59,87,110,115],"orthogonal":[61],"dimensions:":[62],"\\textit{accuracy}":[63],"(semantic":[64],"correctness),":[65],"\\textit{completeness}":[66],"(coverage":[67],"reference":[69],"content),":[70],"\\textit{hallucination}":[72],"(absence":[73],"fabricated":[75],"content).":[76],"Our":[77],"results":[78],"reveal":[79],"Gemini":[81],"generally":[83],"outperform":[84],"OpenAI":[85,100],"overall":[88,96],"quality,":[90],"Gemini~3~Pro":[92],"achieving":[93],"the":[94,120],"highest":[95],"score":[97],"(6.00/10),":[98],"while":[99],"exhibit":[102],"lower":[103],"hallucination":[104],"rates.":[105],"All":[106],"perform":[108],"best":[109],"speech":[111],"worst":[114],"music":[116],"captioning.":[117],"release":[119],"as":[122,124],"well":[123],"code":[126],"to":[127],"facilitate":[128],"reproducible":[129],"understanding":[131],"research.":[132]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-03T00:00:00"}
