{"id":"https://openalex.org/W4387247768","doi":"https://doi.org/10.1109/tcsvt.2023.3318220","title":"Question-Aware Global-Local Video Understanding Network for Audio-Visual Question Answering","display_name":"Question-Aware Global-Local Video Understanding Network for Audio-Visual Question Answering","publication_year":2023,"publication_date":"2023-10-02","ids":{"openalex":"https://openalex.org/W4387247768","doi":"https://doi.org/10.1109/tcsvt.2023.3318220"},"language":"en","primary_location":{"id":"doi:10.1109/tcsvt.2023.3318220","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2023.3318220","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5059312754","display_name":"Zailong Chen","orcid":"https://orcid.org/0009-0003-8431-5471"},"institutions":[{"id":"https://openalex.org/I204824540","display_name":"University of Wollongong","ror":"https://ror.org/00jtmb277","country_code":"AU","type":"education","lineage":["https://openalex.org/I204824540"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Zailong Chen","raw_affiliation_strings":["School of Computing and Information Technology, University of Wollongong, Wollongong, NSW, Australia"],"affiliations":[{"raw_affiliation_string":"School of Computing and Information Technology, University of Wollongong, Wollongong, NSW, Australia","institution_ids":["https://openalex.org/I204824540"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100435848","display_name":"Lei Wang","orcid":"https://orcid.org/0000-0002-0961-0441"},"institutions":[{"id":"https://openalex.org/I204824540","display_name":"University of Wollongong","ror":"https://ror.org/00jtmb277","country_code":"AU","type":"education","lineage":["https://openalex.org/I204824540"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Lei Wang","raw_affiliation_strings":["School of Computing and Information Technology, University of Wollongong, Wollongong, NSW, Australia"],"affiliations":[{"raw_affiliation_string":"School of Computing and Information Technology, University of Wollongong, Wollongong, NSW, Australia","institution_ids":["https://openalex.org/I204824540"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100396039","display_name":"Peng Wang","orcid":"https://orcid.org/0000-0002-5397-9115"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Wang","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5106667221","display_name":"Peng Gao","orcid":"https://orcid.org/0009-0005-7881-712X"},"institutions":[{"id":"https://openalex.org/I25254941","display_name":"Beijing Normal University","ror":"https://ror.org/022k4wk35","country_code":"CN","type":"education","lineage":["https://openalex.org/I25254941"]},{"id":"https://openalex.org/I141568987","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131","country_code":"HK","type":"education","lineage":["https://openalex.org/I141568987"]},{"id":"https://openalex.org/I12615008","display_name":"Beijing Normal University - Hong Kong Baptist University United International College","ror":"https://ror.org/04snvc712","country_code":"CN","type":"education","lineage":["https://openalex.org/I12615008"]}],"countries":["CN","HK"],"is_corresponding":false,"raw_author_name":"Peng Gao","raw_affiliation_strings":["Institute of Computer Science, Beijing Normal University&#x2013;Hong Kong Baptist University United International College, Zhuhai, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, Beijing Normal University&#x2013;Hong Kong Baptist University United International College, Zhuhai, China","institution_ids":["https://openalex.org/I12615008","https://openalex.org/I25254941","https://openalex.org/I141568987"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5059312754"],"corresponding_institution_ids":["https://openalex.org/I204824540"],"apc_list":null,"apc_paid":null,"fwci":2.346,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.90620236,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":"34","issue":"5","first_page":"4109","last_page":"4119"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8193272352218628},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8118559122085571},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.6389100551605225},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.630770206451416},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5856837630271912},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.5485504269599915},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.5272765755653381},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5253885388374329},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.5065110921859741},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4383658170700073},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3806885778903961},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.32575279474258423},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.26069700717926025},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.0805223286151886}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8193272352218628},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8118559122085571},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.6389100551605225},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.630770206451416},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5856837630271912},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.5485504269599915},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.5272765755653381},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5253885388374329},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.5065110921859741},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4383658170700073},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3806885778903961},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32575279474258423},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.26069700717926025},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0805223286151886},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tcsvt.2023.3318220","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2023.3318220","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},{"id":"pmh:oai:ro.uow.edu.au:test2021-15446","is_oa":false,"landing_page_url":"https://doi.org/10.1109/TCSVT.2023.3318220","pdf_url":null,"source":{"id":"https://openalex.org/S4306400510","display_name":"Research Online (University of Wollongong)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I204824540","host_organization_name":"University of Wollongong","host_organization_lineage":["https://openalex.org/I204824540"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Scopus Harvesting Series","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1933349210","https://openalex.org/W2593116425","https://openalex.org/W2619697695","https://openalex.org/W2842511635","https://openalex.org/W2904452845","https://openalex.org/W2905551935","https://openalex.org/W2954199749","https://openalex.org/W2963155035","https://openalex.org/W2966683369","https://openalex.org/W2980339970","https://openalex.org/W2998166190","https://openalex.org/W3002552512","https://openalex.org/W3027790991","https://openalex.org/W3034730770","https://openalex.org/W3044495139","https://openalex.org/W3119243803","https://openalex.org/W3127165192","https://openalex.org/W3162479984","https://openalex.org/W3197828817","https://openalex.org/W3205782068","https://openalex.org/W4214604251","https://openalex.org/W4221139382","https://openalex.org/W4297808394","https://openalex.org/W4304098310","https://openalex.org/W4312377093","https://openalex.org/W4312380001","https://openalex.org/W4312384316","https://openalex.org/W4313163028","https://openalex.org/W4362653417","https://openalex.org/W4385823465","https://openalex.org/W4386113246","https://openalex.org/W4403511263","https://openalex.org/W6719057275","https://openalex.org/W6773226109","https://openalex.org/W6797613833","https://openalex.org/W6845759482","https://openalex.org/W6851997505"],"related_works":["https://openalex.org/W73545470","https://openalex.org/W4224266612","https://openalex.org/W2383394264","https://openalex.org/W4320153225","https://openalex.org/W4293261942","https://openalex.org/W3125968744","https://openalex.org/W203959209","https://openalex.org/W2110287964","https://openalex.org/W2167701463","https://openalex.org/W4307407935"],"abstract_inverted_index":{"As":[0,184],"a":[1,73,95,160],"newly":[2],"emerging":[3],"task,":[4],"audio-visual":[5,164,193],"question":[6,100,129,194],"answering":[7,101,195],"(AVQA)":[8],"has":[9,67],"attracted":[10],"research":[11],"attention.":[12],"Compared":[13],"with":[14,72,197],"traditional":[15],"single-modality":[16],"(e.g.,":[17],"audio":[18,54],"or":[19],"visual)":[20],"QA":[21],"tasks,":[22],"it":[23],"poses":[24],"new":[25,161],"challenges":[26],"due":[27],"to":[28,68,82,132,153],"the":[29,39,49,60,118,127,136,140,145,155,191,198,213,217],"higher":[30],"complexity":[31],"of":[32,48,62,148,202,221],"feature":[33,65,137],"extraction":[34,66,138],"and":[35,55,86,111,114,150,172,204,208,219],"fusion":[36,147],"brought":[37],"by":[38,180],"multimodal":[40,99],"inputs.":[41],"First,":[42],"AVQA":[43,209],"requires":[44],"more":[45,63],"comprehensive":[46],"understanding":[47,166],"scene":[50,107,165],"which":[51],"involves":[52],"both":[53,109,170],"visual":[56],"information;":[57],"Second,":[58],"in":[59],"presence":[61],"information,":[64],"be":[69,83,227],"better":[70],"connected":[71],"given":[74,128],"question;":[75],"Third,":[76],"features":[77,119,152],"from":[78,120],"different":[79,121],"modalities":[80,122],"need":[81],"sufficiently":[84],"correlated":[85],"fused.":[87,125],"To":[88],"address":[89],"this":[90,92],"situation,":[91],"work":[93],"proposes":[94],"novel":[96],"framework":[97,158],"for":[98,163],"task.":[102],"It":[103],"characterises":[104],"an":[105],"audiovisual":[106],"at":[108,139],"global":[110,149],"local":[112,141,151],"levels,":[113],"within":[115],"each":[116],"level,":[117],"are":[123],"well":[124,176],"Furthermore,":[126],"is":[130],"utilised":[131],"guide":[133],"not":[134],"only":[135],"level":[142],"but":[143],"also":[144],"final":[146],"predict":[154],"answer.":[156],"Our":[157,224],"provides":[159],"perspective":[162],"through":[167],"focusing":[168],"on":[169,206],"general":[171],"specific":[173],"representations":[174],"as":[175,177],"aggregating":[178],"multimodalities":[179],"prioritizing":[181],"question-related":[182],"information.":[183],"experimentally":[185],"demonstrated,":[186],"our":[187,222],"method":[188],"significantly":[189],"improves":[190],"existing":[192],"performance,":[196],"averaged":[199],"absolute":[200],"gain":[201],"3.3%":[203],"3.1%":[205],"MUSIC-AVQA":[207],"datasets,":[210],"respectively.":[211],"Moreover,":[212],"ablation":[214],"study":[215],"verifies":[216],"necessity":[218],"effectiveness":[220],"design.":[223],"code":[225],"will":[226],"publicly":[228],"released.":[229]},"counts_by_year":[{"year":2025,"cited_by_count":13},{"year":2024,"cited_by_count":6}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
