{"id":"https://openalex.org/W4415709634","doi":"https://doi.org/10.1109/icme59968.2025.11210080","title":"VidCtx: Context-aware Video Question Answering with Image Models","display_name":"VidCtx: Context-aware Video Question Answering with Image Models","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415709634","doi":"https://doi.org/10.1109/icme59968.2025.11210080"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11210080","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210080","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5031110708","display_name":"Andreas Goulas","orcid":null},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Andreas Goulas","raw_affiliation_strings":["CERTH-ITI &amp; Queen Mary University of London,Thessaloniki,Greece"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CERTH-ITI &amp; Queen Mary University of London,Thessaloniki,Greece","institution_ids":["https://openalex.org/I166337079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059096027","display_name":"Vasileios Mezaris","orcid":"https://orcid.org/0000-0002-0121-4364"},"institutions":[{"id":"https://openalex.org/I4210093649","display_name":"Information Technologies Institute","ror":"https://ror.org/0069akp70","country_code":"GR","type":"nonprofit","lineage":["https://openalex.org/I4210093649"]},{"id":"https://openalex.org/I4210134249","display_name":"Centre for Research and Technology Hellas","ror":"https://ror.org/03bndpq63","country_code":"GR","type":"facility","lineage":["https://openalex.org/I4210134249"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Vasileios Mezaris","raw_affiliation_strings":["CERTH-ITI,Thessaloniki,Greece"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"CERTH-ITI,Thessaloniki,Greece","institution_ids":["https://openalex.org/I4210093649","https://openalex.org/I4210134249"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5031205865","display_name":"Ioannis Patras","orcid":"https://orcid.org/0000-0003-3913-4738"},"institutions":[{"id":"https://openalex.org/I166337079","display_name":"Queen Mary University of London","ror":"https://ror.org/026zzn846","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I166337079"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Ioannis Patras","raw_affiliation_strings":["Queen Mary University of London,London,UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Queen Mary University of London,London,UK","institution_ids":["https://openalex.org/I166337079"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.27386982,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.0005000000237487257,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.0005000000237487257,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.5916000008583069},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5860000252723694},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.548799991607666},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.4934999942779541},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4912000000476837},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4603999853134155},{"id":"https://openalex.org/keywords/aggregate","display_name":"Aggregate (composite)","score":0.4302000105381012}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8144999742507935},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.5916000008583069},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5860000252723694},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.548799991607666},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5162000060081482},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.4934999942779541},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4912000000476837},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4603999853134155},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.4302000105381012},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4178999960422516},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.400299996137619},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3986999988555908},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.36640000343322754},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3375000059604645},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.31540000438690186},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.30390000343322754},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2906999886035919},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.2784999907016754},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26249998807907104}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11210080","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210080","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320334322","display_name":"HORIZON EUROPE Framework Programme","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W3175961224","https://openalex.org/W4389519587","https://openalex.org/W4389520282","https://openalex.org/W4390873732","https://openalex.org/W4390889724","https://openalex.org/W4402670892","https://openalex.org/W4402671548","https://openalex.org/W4402683949","https://openalex.org/W4402727142","https://openalex.org/W4402727624","https://openalex.org/W4402981195","https://openalex.org/W4403760933","https://openalex.org/W4404724811","https://openalex.org/W4404770348","https://openalex.org/W4404781975","https://openalex.org/W4404784276","https://openalex.org/W4412888622","https://openalex.org/W4413145669"],"related_works":[],"abstract_inverted_index":{"To":[0,67,167],"address":[1,68],"computational":[2],"and":[3,26,56,93,159,209,239],"memory":[4],"limitations":[5],"of":[6,64,96,127,163,177,206,215],"Large":[7,31,112],"Multimodal":[8,113],"Models":[9],"in":[10,44,71,106],"the":[11,40,47,101,107,137,145,157,161,175,192,198,203,207],"Video":[12,234],"Question-Answering":[13],"task,":[14],"several":[15],"recent":[16],"methods":[17],"extract":[18,119],"textual":[19,62,94,124],"representations":[20],"per":[21],"frame":[22],"(e.g.,":[23],"by":[24],"captioning)":[25],"feed":[27],"them":[28,37],"to":[29,38,53,59,118,143,190,200,211],"a":[30,77,110,153,181,212],"Language":[32],"Model":[33,114],"(LLM)":[34],"that":[35,99,219,226],"processes":[36],"produce":[39],"final":[41],"response.":[42],"However,":[43],"this":[45,72],"way,":[46],"LLM":[48],"does":[49],"not":[50],"have":[51],"access":[52],"visual":[54,88],"information":[55,89],"often":[57],"has":[58],"process":[60],"repetitive":[61],"descriptions":[63,95,125,176],"nearby":[65],"frames.":[66,129,179,216],"those":[69],"shortcomings,":[70],"paper,":[73],"we":[74,171],"introduce":[75],"VidCtx,":[76],"novel":[78],"training-free":[79],"VideoQA":[80],"framework":[81,109],"which":[82],"integrates":[83],"both":[84,87],"modalities,":[85],"i.e.":[86],"from":[90],"input":[91,151],"frames":[92,98],"others":[97],"give":[100],"appropriate":[102,165],"context.":[103],"More":[104],"specifically,":[105],"proposed":[108],"pre-trained":[111],"(LMM)":[115],"is":[116,188,243],"prompted":[117,142],"at":[120,147,245],"regular":[121],"intervals,":[122],"question-aware":[123],"(captions)":[126],"video":[128,208],"Those":[130],"will":[131,140],"be":[132,141],"used":[133,189],"as":[134,150,173],"context":[135,174],"when":[136],"same":[138],"LMM":[139],"answer":[144],"question":[146,158],"hand":[148],"given":[149],"a)":[152],"certain":[154],"frame,":[155],"b)":[156],"c)":[160],"context/caption":[162],"an":[164],"frame.":[166],"avoid":[168],"redundant":[169],"information,":[170],"chose":[172],"distant":[178],"Finally,":[180],"simple":[182],"yet":[183],"effective":[184],"max":[185],"pooling":[186],"mechanism":[187],"aggregate":[191],"frame-level":[193],"decisions.":[194],"This":[195],"methodology":[196],"enables":[197],"model":[199],"focus":[201],"on":[202,228,231],"relevant":[204],"segments":[205],"scale":[210],"high":[213],"number":[214],"Experiments":[217],"show":[218],"VidCtx":[220],"achieves":[221],"competitive":[222],"performance":[223],"among":[224],"approaches":[225],"rely":[227],"open":[229],"models":[230],"three":[232],"public":[233],"QA":[235],"benchmarks,":[236],"NExT-QA,":[237],"IntentQA":[238],"STAR.":[240],"Our":[241],"code":[242],"available":[244],"https://github.com/IDT-ITI/VidCtx.":[246]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-30T00:00:00"}
