{"id":"https://openalex.org/W7164831396","doi":"https://doi.org/10.1145/3805622.3810622","title":"Teaching Audio-Language Models to Reason over Time","display_name":"Teaching Audio-Language Models to Reason over Time","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164831396","doi":"https://doi.org/10.1145/3805622.3810622"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810622","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810622","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810622","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027459729","display_name":"Yan Xu","orcid":"https://orcid.org/0000-0003-0379-8866"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yufeng Xu","raw_affiliation_strings":["College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0005-8533-285X","affiliations":[{"raw_affiliation_string":"College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072369703","display_name":"Yunjia Li","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunjia Li","raw_affiliation_strings":["College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0003-7100-2214","affiliations":[{"raw_affiliation_string":"College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138656752","display_name":"Hai Wang","orcid":"https://orcid.org/0009-0005-6535-6352"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]},{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hai Wang","raw_affiliation_strings":["School of Computer Science, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0005-6535-6352","affiliations":[{"raw_affiliation_string":"School of Computer Science, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915","https://openalex.org/I36399199"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100318193","display_name":"Wei Li","orcid":"https://orcid.org/0000-0002-4486-8341"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Li","raw_affiliation_strings":["College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China and Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-4486-8341","affiliations":[{"raw_affiliation_string":"College of Computer Science and Artificial Intelligence, Fudan University, Shanghai, China and Shanghai Key Laboratory of Intelligent Information Processing, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.94276052,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1073","last_page":"1082"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.6704999804496765,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.6704999804496765,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.066600002348423,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.04769999906420708,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.65420001745224},{"id":"https://openalex.org/keywords/bootstrapping","display_name":"Bootstrapping (finance)","score":0.6207000017166138},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5806999802589417},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.5138999819755554},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5078999996185303},{"id":"https://openalex.org/keywords/temporal-resolution","display_name":"Temporal resolution","score":0.5037999749183655},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.4383000135421753},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.4043000042438507}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7469000220298767},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.65420001745224},{"id":"https://openalex.org/C207609745","wikidata":"https://www.wikidata.org/wiki/Q4944086","display_name":"Bootstrapping (finance)","level":2,"score":0.6207000017166138},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5806999802589417},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.548799991607666},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.5138999819755554},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5078999996185303},{"id":"https://openalex.org/C119666444","wikidata":"https://www.wikidata.org/wiki/Q5977280","display_name":"Temporal resolution","level":2,"score":0.5037999749183655},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4607999920845032},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.4383000135421753},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.4043000042438507},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.36059999465942383},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.3571999967098236},{"id":"https://openalex.org/C77277458","wikidata":"https://www.wikidata.org/wiki/Q1969246","display_name":"Temporal database","level":2,"score":0.33320000767707825},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.29899999499320984},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.29440000653266907},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.287200003862381},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28380000591278076},{"id":"https://openalex.org/C132459708","wikidata":"https://www.wikidata.org/wiki/Q744069","display_name":"Extrapolation","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.2605000138282776},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.25529998540878296},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810622","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810622","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810622","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810622","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.6956659555435181,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":10,"referenced_works":["https://openalex.org/W2038484192","https://openalex.org/W2566935005","https://openalex.org/W2593116425","https://openalex.org/W4205689591","https://openalex.org/W4210913346","https://openalex.org/W4400033239","https://openalex.org/W4402230126","https://openalex.org/W4410087135","https://openalex.org/W4411120149","https://openalex.org/W4416035826"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Audio-Language":[1],"Models":[2],"(LALMs)":[3],"have":[4],"achieved":[5],"remarkable":[6],"progress":[7],"in":[8,118,160,203,225],"general":[9],"audio":[10,75,86,226],"understanding.":[11],"Nevertheless,":[12],"they":[13],"still":[14],"exhibit":[15],"significant":[16],"limitations":[17,113],"when":[18],"reasoning":[19,139],"about":[20],"the":[21,35,44,71,112,137,198,216],"complex":[22,120],"temporal":[23,72,96,121,204,227],"relationships":[24],"between":[25],"sound":[26],"events.":[27],"This":[28,131],"bottleneck":[29],"arises":[30],"from":[31],"two":[32],"key":[33],"problems:":[34],"lack":[36],"of":[37,46,74,85,114,201],"a":[38,47,64,125,142,148],"systematic":[39],"benchmark":[40,68],"for":[41,51],"evaluation":[42],"and":[43,87,100,110,158,188],"absence":[45],"reasoning-oriented":[48],"training":[49],"paradigm":[50],"audio.":[52],"To":[53,103],"resolve":[54],"this,":[55],"we":[56,123,172],"introduce":[57,173],"TASC-Bench":[58,80,217],"(Temporal":[59],"Audio":[60],"Scene":[61],"Comprehension":[62],"Benchmark),":[63],"large-scale":[65],"question-answering":[66],"(QA)":[67],"focused":[69],"on":[70],"understanding":[73],"scenes,":[76],"built":[77],"upon":[78],"AudioSet.":[79],"comprises":[81],"over":[82,108],"320":[83],"hours":[84],"9.5":[88],"million":[89],"QA":[90],"pairs,":[91],"covering":[92],"seven":[93],"dimensions,":[94],"including":[95],"boundaries,":[97],"event":[98],"durations,":[99],"inter-event":[101],"relations.":[102],"teach":[104],"LALMs":[105,202],"to":[106,150,155,185,221],"reason":[107],"time":[109],"overcome":[111],"Supervised":[115],"Fine-Tuning":[116],"(SFT)":[117],"tackling":[119],"reasoning,":[122,157,205],"propose":[124],"cross-modal":[126],"Chain-of-Thought":[127],"(CoT)":[128],"distillation":[129],"strategy.":[130],"progressive":[132],"approach":[133],"begins":[134],"by":[135],"bootstrapping":[136],"model\u2019s":[138],"capabilities":[140],"using":[141],"large":[143],"language":[144],"model":[145],"(LLM)":[146],"as":[147],"teacher":[149],"generate":[151],"CoT,":[152],"then":[153],"transitions":[154],"self-driven":[156],"culminates":[159],"aligning":[161],"its":[162],"behavior":[163],"with":[164],"preference":[165],"data.":[166],"For":[167],"this":[168],"final":[169],"alignment":[170],"stage,":[171],"our":[174,195,207],"improved":[175],"Equilibrated":[176],"Chain":[177],"Direct":[178],"Preference":[179],"Optimization":[180],"(EC-DPO)":[181],"method":[182,208],"alongside":[183],"SFT":[184],"mitigate":[186],"hallucinations":[187],"enhance":[189],"stability.":[190],"Our":[191],"experiments":[192],"demonstrate":[193],"that":[194],"dataset":[196,218],"reveals":[197],"current":[199],"shortcomings":[200],"while":[206],"significantly":[209],"enhances":[210],"these":[211],"capabilities.":[212],"We":[213],"will":[214],"make":[215],"publicly":[219],"available":[220],"foster":[222],"further":[223],"research":[224],"reasoning.":[228]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
