{"id":"https://openalex.org/W4387968051","doi":"https://doi.org/10.1145/3581783.3613909","title":"Language-Guided Visual Aggregation Network for Video Question Answering","display_name":"Language-Guided Visual Aggregation Network for Video Question Answering","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4387968051","doi":"https://doi.org/10.1145/3581783.3613909"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3613909","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3613909","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101407402","display_name":"Liang Xiao","orcid":"https://orcid.org/0000-0003-0382-2715"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiao Liang","raw_affiliation_strings":["Xidian University, Xian, China"],"raw_orcid":"https://orcid.org/0000-0003-0382-2715","affiliations":[{"raw_affiliation_string":"Xidian University, Xian, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100401389","display_name":"Di Wang","orcid":"https://orcid.org/0000-0001-8027-4287"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Di Wang","raw_affiliation_strings":["Xidian University, Xian, China"],"raw_orcid":"https://orcid.org/0000-0001-8027-4287","affiliations":[{"raw_affiliation_string":"Xidian University, Xian, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Quan Wang","orcid":"https://orcid.org/0000-0002-3410-9560"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Quan Wang","raw_affiliation_strings":["Xidian University, Xian, China"],"raw_orcid":"https://orcid.org/0000-0002-3410-9560","affiliations":[{"raw_affiliation_string":"Xidian University, Xian, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Bo Wan","orcid":"https://orcid.org/0000-0001-6913-8604"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Wan","raw_affiliation_strings":["Xidian University, Xian, China"],"raw_orcid":"https://orcid.org/0000-0001-6913-8604","affiliations":[{"raw_affiliation_string":"Xidian University, Xian, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102970747","display_name":"Lingling An","orcid":"https://orcid.org/0000-0002-0103-489X"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lingling An","raw_affiliation_strings":["Xidian University, Xian, China"],"raw_orcid":"https://orcid.org/0000-0002-0103-489X","affiliations":[{"raw_affiliation_string":"Xidian University, Xian, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039837634","display_name":"Lihuo He","orcid":"https://orcid.org/0000-0002-0555-3574"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lihuo He","raw_affiliation_strings":["Xidian University, Xian, China"],"raw_orcid":"https://orcid.org/0000-0002-0555-3574","affiliations":[{"raw_affiliation_string":"Xidian University, Xian, China","institution_ids":["https://openalex.org/I149594827"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.4491,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.6457804,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"5195","last_page":"5203"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9957000017166138,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9843999743461609,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8547227382659912},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.6524256467819214},{"id":"https://openalex.org/keywords/merge","display_name":"Merge (version control)","score":0.5215746760368347},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.46226418018341064},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.45553162693977356},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.45262643694877625},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4518853425979614},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.4326803684234619},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4154638648033142},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3953017592430115},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.2984396815299988}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8547227382659912},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.6524256467819214},{"id":"https://openalex.org/C197129107","wikidata":"https://www.wikidata.org/wiki/Q1921621","display_name":"Merge (version control)","level":2,"score":0.5215746760368347},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.46226418018341064},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.45553162693977356},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.45262643694877625},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4518853425979614},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.4326803684234619},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4154638648033142},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3953017592430115},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2984396815299988},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3581783.3613909","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3581783.3613909","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.6100000143051147,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W2277195237","https://openalex.org/W2606982687","https://openalex.org/W2765716052","https://openalex.org/W2886641317","https://openalex.org/W2954199749","https://openalex.org/W2962949233","https://openalex.org/W2998166190","https://openalex.org/W3168640669","https://openalex.org/W3175961224","https://openalex.org/W3204588463","https://openalex.org/W3206675006","https://openalex.org/W3207847779","https://openalex.org/W4251247712","https://openalex.org/W4285191490","https://openalex.org/W4285531802","https://openalex.org/W4285606530","https://openalex.org/W4286696412","https://openalex.org/W4304084136","https://openalex.org/W4312246181","https://openalex.org/W4312480274","https://openalex.org/W4312954271","https://openalex.org/W4312974690","https://openalex.org/W4313071966","https://openalex.org/W4386076615"],"related_works":["https://openalex.org/W2384605597","https://openalex.org/W2387743295","https://openalex.org/W2115758952","https://openalex.org/W3082787378","https://openalex.org/W2136007095","https://openalex.org/W2366230879","https://openalex.org/W3208425359","https://openalex.org/W2349927912","https://openalex.org/W3159777597","https://openalex.org/W4234886518"],"abstract_inverted_index":{"Video":[0],"Question":[1],"Answering":[2],"(VideoQA)":[3],"aims":[4],"to":[5,25,55,78,121,142,162,173,181],"comprehend":[6],"intricate":[7],"relationships,":[8],"actions,":[9],"and":[10,23,86,106,109,126,149,165,188,193,205,216],"events":[11],"within":[12],"video":[13,89,170,203],"content,":[14],"as":[15,17,73],"well":[16],"the":[18,33,130,135,146,169,174,183,211],"inherent":[19],"links":[20],"between":[21,185],"objects":[22],"scenes,":[24],"answer":[26,113,189],"text-based":[27],"questions":[28],"accurately.":[29],"Transferring":[30],"knowledge":[31],"from":[32],"cross-modal":[34],"pre-trained":[35],"model":[36],"CLIP":[37,72],"is":[38,140,160,179,227],"a":[39,64,99],"natural":[40],"approach,":[41],"but":[42],"its":[43],"dual-tower":[44],"structure":[45],"hinders":[46],"fine-grained":[47],"modality":[48],"interaction,":[49],"posing":[50],"challenges":[51],"for":[52,153],"direct":[53],"application":[54],"VideoQA":[56],"tasks.":[57,195],"To":[58],"address":[59],"this":[60],"issue,":[61],"we":[62],"introduce":[63],"Language-Guided":[65],"Visual":[66],"Aggregation":[67],"(LGVA)":[68],"network.":[69],"It":[70],"employs":[71,118],"an":[74],"effective":[75],"feature":[76],"extractor":[77],"obtain":[79],"language-aligned":[80],"visual":[81,96,187],"features":[82],"with":[83],"different":[84],"granularities":[85],"avoids":[87],"resource-intensive":[88],"pre-training.":[90],"The":[91],"LGVA":[92],"network":[93],"progressively":[94],"aggregates":[95],"information":[97],"in":[98,134],"bottom-up":[100],"manner,":[101],"focusing":[102],"on":[103,210],"both":[104],"regional":[105],"temporal":[107],"levels,":[108],"ultimately":[110],"facilitating":[111],"accurate":[112],"prediction.":[114],"More":[115],"specifically,":[116],"it":[117],"local":[119],"cross-attention":[120,159],"combine":[122],"pre-extracted":[123],"question":[124],"tokens":[125],"region":[127],"embeddings,":[128,167,190],"pinpointing":[129],"object":[131],"of":[132],"interest":[133],"question.":[136,175],"Then,":[137],"graph":[138],"attention":[139],"utilized":[141],"aggregate":[143],"regions":[144],"at":[145,229],"frame":[147],"level":[148],"integrate":[150],"additional":[151],"captions":[152],"enhanced":[154],"detail.":[155],"Following":[156],"this,":[157],"global":[158],"used":[161],"merge":[163],"sentence":[164],"frame-level":[166],"identifying":[168],"segment":[171],"relevant":[172],"Ultimately,":[176],"contrastive":[177],"learning":[178],"applied":[180],"optimize":[182],"similarities":[184],"aggregated":[186],"unifying":[191],"upstream":[192],"downstream":[194],"Our":[196,225],"method":[197],"conserves":[198],"resources":[199],"by":[200],"avoiding":[201],"large-scale":[202],"pre-training":[204],"simultaneously":[206],"demonstrates":[207],"commendable":[208],"performance":[209],"NExT-QA,":[212],"MSVD-QA,":[213],"MSRVTT-QA,":[214],"TGIF-QA,":[215],"ActivityNet-QA":[217],"datasets,":[218],"even":[219],"outperforming":[220],"some":[221],"end-to-end":[222],"trained":[223],"models.":[224],"code":[226],"available":[228],"https://github.com/ecoxial2007/LGVA_VideoQA.":[230]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
