{"id":"https://openalex.org/W7164804496","doi":"https://doi.org/10.1145/3805622.3810845","title":"A Unified Object-Centric Spatio-Temporal Graph Reasoning Framework for Audio-Visual Question Answering","display_name":"A Unified Object-Centric Spatio-Temporal Graph Reasoning Framework for Audio-Visual Question Answering","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164804496","doi":"https://doi.org/10.1145/3805622.3810845"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810845","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810845","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810845","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045082466","display_name":"Feifei Xu","orcid":"https://orcid.org/0000-0001-8044-5912"},"institutions":[{"id":"https://openalex.org/I23632641","display_name":"Shanghai University of Electric Power","ror":"https://ror.org/02w4tny03","country_code":"CN","type":"education","lineage":["https://openalex.org/I23632641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feifei Xu","raw_affiliation_strings":["Shanghai University of Electric Power, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-8044-5912","affiliations":[{"raw_affiliation_string":"Shanghai University of Electric Power, Shanghai, China","institution_ids":["https://openalex.org/I23632641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125099685","display_name":"Wenjing Zhu","orcid":null},"institutions":[{"id":"https://openalex.org/I23632641","display_name":"Shanghai University of Electric Power","ror":"https://ror.org/02w4tny03","country_code":"CN","type":"education","lineage":["https://openalex.org/I23632641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjing Zhu","raw_affiliation_strings":["Shanghai University of Electric Power, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-6928-4141","affiliations":[{"raw_affiliation_string":"Shanghai University of Electric Power, Shanghai, China","institution_ids":["https://openalex.org/I23632641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100459326","display_name":"Dongyang Li","orcid":"https://orcid.org/0000-0003-0818-2526"},"institutions":[{"id":"https://openalex.org/I23632641","display_name":"Shanghai University of Electric Power","ror":"https://ror.org/02w4tny03","country_code":"CN","type":"education","lineage":["https://openalex.org/I23632641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongyang Li","raw_affiliation_strings":["Shanghai University of Electric Power, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-5956-9143","affiliations":[{"raw_affiliation_string":"Shanghai University of Electric Power, Shanghai, China","institution_ids":["https://openalex.org/I23632641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125186379","display_name":"Puzhe Li","orcid":null},"institutions":[{"id":"https://openalex.org/I23632641","display_name":"Shanghai University of Electric Power","ror":"https://ror.org/02w4tny03","country_code":"CN","type":"education","lineage":["https://openalex.org/I23632641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Puzhe Li","raw_affiliation_strings":["Shanghai University of Electric Power, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0000-1275-3454","affiliations":[{"raw_affiliation_string":"Shanghai University of Electric Power, Shanghai, China","institution_ids":["https://openalex.org/I23632641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079864048","display_name":"Luobin Huang","orcid":"https://orcid.org/0009-0006-4031-5148"},"institutions":[{"id":"https://openalex.org/I23632641","display_name":"Shanghai University of Electric Power","ror":"https://ror.org/02w4tny03","country_code":"CN","type":"education","lineage":["https://openalex.org/I23632641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Luobin Huang","raw_affiliation_strings":["Shanghai University of Electric Power, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0001-9994-1382","affiliations":[{"raw_affiliation_string":"Shanghai University of Electric Power, Shanghai, China","institution_ids":["https://openalex.org/I23632641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138659438","display_name":"Yu Xie","orcid":"https://orcid.org/0009-0008-8740-2374"},"institutions":[{"id":"https://openalex.org/I23632641","display_name":"Shanghai University of Electric Power","ror":"https://ror.org/02w4tny03","country_code":"CN","type":"education","lineage":["https://openalex.org/I23632641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Xie","raw_affiliation_strings":["Shanghai University of Electric Power, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0008-8740-2374","affiliations":[{"raw_affiliation_string":"Shanghai University of Electric Power, Shanghai, China","institution_ids":["https://openalex.org/I23632641"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5121819504","display_name":"Zirui Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I23632641","display_name":"Shanghai University of Electric Power","ror":"https://ror.org/02w4tny03","country_code":"CN","type":"education","lineage":["https://openalex.org/I23632641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zirui Xu","raw_affiliation_strings":["Shanghai University of Electric Power, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0008-9259-6345","affiliations":[{"raw_affiliation_string":"Shanghai University of Electric Power, Shanghai, China","institution_ids":["https://openalex.org/I23632641"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93336108,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1006","last_page":"1014"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9351000189781189,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9351000189781189,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.02239999920129776,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.01209999993443489,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8003000020980835},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.46549999713897705},{"id":"https://openalex.org/keywords/non-monotonic-logic","display_name":"Non-monotonic logic","score":0.37229999899864197},{"id":"https://openalex.org/keywords/circumscription","display_name":"Circumscription","score":0.3346000015735626},{"id":"https://openalex.org/keywords/knowledge-representation-and-reasoning","display_name":"Knowledge representation and reasoning","score":0.3172999918460846}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8003000020980835},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6141999959945679},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5121999979019165},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.46549999713897705},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.37229999899864197},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3671000003814697},{"id":"https://openalex.org/C62360110","wikidata":"https://www.wikidata.org/wiki/Q96777007","display_name":"Circumscription","level":2,"score":0.3346000015735626},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.3172999918460846},{"id":"https://openalex.org/C2777152325","wikidata":"https://www.wikidata.org/wiki/Q108163","display_name":"Proposition","level":2,"score":0.2946999967098236},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C2987255567","wikidata":"https://www.wikidata.org/wiki/Q33002955","display_name":"Knowledge graph","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810845","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810845","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810845","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810845","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W2526050071","https://openalex.org/W2593116425","https://openalex.org/W2980339970","https://openalex.org/W2997344006","https://openalex.org/W3034336960","https://openalex.org/W4214604251","https://openalex.org/W4304098310","https://openalex.org/W4312380001","https://openalex.org/W4382202615","https://openalex.org/W4385574085","https://openalex.org/W4386065882","https://openalex.org/W4386113246","https://openalex.org/W4386453561","https://openalex.org/W4388192054","https://openalex.org/W4390873421","https://openalex.org/W4392903494","https://openalex.org/W4393154284","https://openalex.org/W4394592978","https://openalex.org/W4394625659","https://openalex.org/W4402904188","https://openalex.org/W4403791445","https://openalex.org/W4405028135","https://openalex.org/W4409367415","https://openalex.org/W4413144295","https://openalex.org/W4415799050","https://openalex.org/W7131086145","https://openalex.org/W7133210299","https://openalex.org/W7133238718"],"related_works":[],"abstract_inverted_index":{"Audio-Visual":[0],"Question":[1],"Answering":[2],"(AVQA)":[3],"aims":[4],"to":[5,28,59,98,100,125,132,150],"answer":[6],"textual":[7],"questions":[8],"by":[9],"perceiving":[10],"and":[11,86,103,158,176,201,210],"understanding":[12],"audio-visual":[13],"information,":[14],"which":[15,94,144],"requires":[16],"not":[17],"only":[18],"effective":[19,209],"question-guided":[20],"multimodal":[21,194],"reasoning,":[22],"but":[23],"also":[24],"fine-grained":[25,88],"spatio-temporal":[26,71],"modeling":[27],"capture":[29],"dynamic":[30,170],"changes.":[31],"However,":[32],"existing":[33],"methods":[34],"mainly":[35],"operate":[36],"at":[37],"the":[38,123,169,186,199],"patch":[39],"level":[40],"with":[41,77,91,118],"strict":[42],"partitioning,":[43],"overlooking":[44],"object-level":[45],"semantic":[46],"features.":[47],"Furthermore,":[48],"direct":[49],"fusion":[50],"of":[51,172,217],"question":[52,189],"features":[53],"in":[54],"most":[55],"studies":[56],"restricts":[57],"attention":[58],"question-specific":[60],"details.":[61],"To":[62],"address":[63],"these":[64],"challenges,":[65],"we":[66,106,136],"propose":[67],"a":[68,78,108,138,147,215],"unified":[69,148],"object-centric":[70],"graph":[72,149,187],"reasoning":[73,219],"framework.":[74],"It":[75],"begins":[76],"Motion-guided":[79],"Object":[80,110],"Tracking":[81,111],"(MOT)":[82],"module":[83,113],"that":[84,114,129,205],"identifies":[85],"localizes":[87],"visual":[89,116,153],"objects":[90,117,128,173],"significant":[92],"motion,":[93],"are":[95,130],"more":[96],"likely":[97],"relate":[99],"sounding":[101],"behaviors":[102],"questions.":[104],"Next,":[105],"design":[107],"Sound-guided":[109],"(SOT)":[112],"associates":[115],"corresponding":[119],"audio":[120,159],"signals,":[121],"enabling":[122,192],"model":[124,152],"highlight":[126],"key":[127],"relevant":[131],"sound":[133],"information.":[134],"Then,":[135],"devise":[137],"Question-guided":[139],"Spatio-Temporal":[140],"Graph":[141],"(QSTG)":[142],"module,":[143],"first":[145],"constructs":[146],"jointly":[151],"object":[154,160],"nodes":[155],"across":[156,174,214],"frames":[157],"nodes.":[161],"Based":[162],"on":[163,198],"this":[164],"graph,":[165],"QSTG":[166],"continuously":[167],"captures":[168],"evolution":[171],"spatial":[175],"temporal":[177],"dimensions,":[178],"while":[179],"adaptively":[180],"modulating":[181],"information":[182],"propagation":[183],"weights":[184],"throughout":[185],"under":[188],"guidance,":[190],"thereby":[191],"refined":[193],"inference.":[195],"Extensive":[196],"experiments":[197],"MUSIC-AVQA":[200],"MUSIC-AVQA-R":[202],"datasets":[203],"demonstrate":[204],"our":[206],"framework":[207],"is":[208],"achieves":[211],"state-of-the-art":[212],"performance":[213],"variety":[216],"complex":[218],"tasks.":[220]},"counts_by_year":[],"updated_date":"2026-06-17T06:14:20.161405","created_date":"2026-06-16T00:00:00"}
