{"id":"https://openalex.org/W7162694769","doi":"https://doi.org/10.48550/arxiv.2605.28480","title":"Audio-Mind: An Auditable Agentic Framework for Audio Understanding","display_name":"Audio-Mind: An Auditable Agentic Framework for Audio Understanding","publication_year":2026,"publication_date":"2026-05-27","ids":{"openalex":"https://openalex.org/W7162694769","doi":"https://doi.org/10.48550/arxiv.2605.28480"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.28480","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28480","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.28480","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137300961","display_name":"Yucheng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yucheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137201762","display_name":"Jing Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137258273","display_name":"Hanqi Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hanqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100624785","display_name":"Chenghao Wang","orcid":"https://orcid.org/0000-0003-2359-5775"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chenghao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137307393","display_name":"Wenming Tu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Wenming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137303626","display_name":"Yu Xi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xi, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137248717","display_name":"Zhaokai Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Zhaokai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137204617","display_name":"Kai Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Kai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137203308","display_name":"Shuai Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.5188999772071838,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.5188999772071838,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.1688999980688095,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.13580000400543213,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.6985999941825867},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6524999737739563},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5799000263214111},{"id":"https://openalex.org/keywords/orchestration","display_name":"Orchestration","score":0.3824999928474426},{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.3725000023841858},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.36250001192092896},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.323199987411499}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7190999984741211},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.6985999941825867},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6524999737739563},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5799000263214111},{"id":"https://openalex.org/C199168358","wikidata":"https://www.wikidata.org/wiki/Q3367000","display_name":"Orchestration","level":3,"score":0.3824999928474426},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.3725000023841858},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.36410000920295715},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.36250001192092896},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.35659998655319214},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.323199987411499},{"id":"https://openalex.org/C112313634","wikidata":"https://www.wikidata.org/wiki/Q7886648","display_name":"Complement (music)","level":5,"score":0.3073999881744385},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3028999865055084},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.2939000129699707},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.28380000591278076},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2533999979496002}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.28480","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28480","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.28480","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28480","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.49690505862236023,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Audio":[0],"agents":[1],"extend":[2],"large":[3],"audio-language":[4],"models":[5],"(LALMs)":[6],"by":[7],"decomposing":[8],"audio":[9,41,56,119],"questions":[10,82],"into":[11],"tool":[12,31,66,150],"calls,":[13],"intermediate":[14],"evidence,":[15,151],"and":[16,48,90,104,152,164],"iterative":[17],"reasoning":[18,145],"steps.":[19],"However,":[20],"as":[21],"LALMs":[22],"become":[23,124],"stronger,":[24],"the":[25,129,134],"key":[26],"challenge":[27],"shifts":[28],"from":[29],"enabling":[30],"use":[32],"to":[33],"determining":[34],"when":[35,71,128],"agentic":[36,121],"evidence":[37,53,73,80,85],"acquisition":[38,54],"genuinely":[39],"benefits":[40],"understanding.":[42,57],"We":[43],"propose":[44],"Audio-Mind,":[45],"an":[46,125],"auditable":[47,144],"pluggable":[49],"framework":[50],"for":[51,81,159],"conditional":[52],"in":[55],"Audio-Mind":[58,94,141],"dynamically":[59],"combines":[60],"a":[61,156],"strong":[62,118],"frontend":[63,69],"with":[64,83],"planner-guided":[65],"use,":[67],"preserving":[68],"judgment":[70],"initial":[72],"is":[74],"sufficient":[75],"while":[76],"acquiring":[77],"bounded":[78],"external":[79],"unresolved":[84],"gaps.":[86],"Experiments":[87],"on":[88,102,107],"MMAR":[89,103],"MSU-Bench":[91],"show":[92],"that":[93,147],"outperforms":[95],"prior":[96],"audio-agent":[97],"baselines,":[98],"reaching":[99],"80.4%":[100],"accuracy":[101,106],"82.8%":[105],"MSU-Bench.":[108],"A":[109],"matched-backbone":[110],"comparison":[111],"highlights":[112],"why":[113],"this":[114],"design":[115],"matters:":[116],"under":[117],"frontends,":[120],"decomposition":[122],"can":[123],"orchestration":[126],"bottleneck":[127],"workflow":[130],"does":[131],"not":[132],"preserve":[133],"frontend's":[135],"holistic":[136],"audio-grounded":[137],"judgment.":[138],"Beyond":[139],"accuracy,":[140],"produces":[142],"higher-quality,":[143],"traces":[146],"expose":[148],"uncertainty,":[149],"answer":[153],"rationales,":[154],"offering":[155],"potential":[157],"basis":[158],"more":[160],"reliable":[161],"audio-QA":[162],"annotation":[163],"error":[165],"analysis.":[166]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-29T00:00:00"}
