{"id":"https://openalex.org/W4416036655","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.597","title":"D-CoDe: Scaling Image-Pretrained VLMs to Video via Dynamic Compression and Question Decomposition","display_name":"D-CoDe: Scaling Image-Pretrained VLMs to Video via Dynamic Compression and Question Decomposition","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416036655","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.597"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.597","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.597","pdf_url":"https://aclanthology.org/2025.emnlp-main.597.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.597.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101492567","display_name":"Yiyang Huang","orcid":"https://orcid.org/0000-0002-1741-6649"},"institutions":[{"id":"https://openalex.org/I87182695","display_name":"Universidad del Noreste","ror":"https://ror.org/02ahky613","country_code":"MX","type":"education","lineage":["https://openalex.org/I87182695"]}],"countries":["MX"],"is_corresponding":true,"raw_author_name":"Yiyang Huang","raw_affiliation_strings":["Northeastern University"],"affiliations":[{"raw_affiliation_string":"Northeastern University","institution_ids":["https://openalex.org/I87182695"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100602395","display_name":"Yizhou Wang","orcid":"https://orcid.org/0000-0001-9888-6409"},"institutions":[{"id":"https://openalex.org/I87182695","display_name":"Universidad del Noreste","ror":"https://ror.org/02ahky613","country_code":"MX","type":"education","lineage":["https://openalex.org/I87182695"]}],"countries":["MX"],"is_corresponding":false,"raw_author_name":"Yizhou Wang","raw_affiliation_strings":["Northeastern University"],"affiliations":[{"raw_affiliation_string":"Northeastern University","institution_ids":["https://openalex.org/I87182695"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5056948852","display_name":"Yun Fu","orcid":"https://orcid.org/0000-0002-8099-4302"},"institutions":[{"id":"https://openalex.org/I87182695","display_name":"Universidad del Noreste","ror":"https://ror.org/02ahky613","country_code":"MX","type":"education","lineage":["https://openalex.org/I87182695"]}],"countries":["MX"],"is_corresponding":false,"raw_author_name":"Yun Fu","raw_affiliation_strings":["Northeastern University"],"affiliations":[{"raw_affiliation_string":"Northeastern University","institution_ids":["https://openalex.org/I87182695"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101492567"],"corresponding_institution_ids":["https://openalex.org/I87182695"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35146482,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"11809","last_page":"11822"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.09849999845027924,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.09849999845027924,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.05270000174641609,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12611","display_name":"Neural Networks and Reservoir Computing","score":0.052299998700618744,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.4853000044822693},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.47510001063346863},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.41920000314712524},{"id":"https://openalex.org/keywords/decomposition","display_name":"Decomposition","score":0.32420000433921814},{"id":"https://openalex.org/keywords/image-compression","display_name":"Image compression","score":0.28859999775886536}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5586000084877014},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.4853000044822693},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.47510001063346863},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4408000111579895},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.41920000314712524},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4140999913215637},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41029998660087585},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.32420000433921814},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3000999987125397},{"id":"https://openalex.org/C13481523","wikidata":"https://www.wikidata.org/wiki/Q412438","display_name":"Image compression","level":4,"score":0.28859999775886536},{"id":"https://openalex.org/C2780500908","wikidata":"https://www.wikidata.org/wiki/Q28405357","display_name":"Dynamic scaling","level":3,"score":0.27090001106262207},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2700999975204468},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.26159998774528503}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.597","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.597","pdf_url":"https://aclanthology.org/2025.emnlp-main.597.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.597","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.597","pdf_url":"https://aclanthology.org/2025.emnlp-main.597.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416036655.pdf","grobid_xml":"https://content.openalex.org/works/W4416036655.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video":[0],"large":[1],"language":[2],"models":[3,19],"(Vid-LLMs),":[4],"which":[5],"excel":[6],"in":[7,53,154],"diverse":[8],"video-language":[9,157],"tasks,":[10],"can":[11],"be":[12],"effectively":[13,135],"constructed":[14],"by":[15,109],"adapting":[16],"image-pretrained":[17],"vision-language":[18],"(VLMs).However,":[20],"this":[21],"adaptation":[22,69],"remains":[23],"challenging,":[24],"as":[25,50],"it":[26],"requires":[27],"processing":[28],"dense":[29],"and":[30,47,75,90,127],"temporally":[31],"extended":[32],"visual":[33],"inputs":[34],"that":[35,71,133],"exceed":[36],"the":[37,44,58,81,111,117,125,145,150],"capacity":[38],"of":[39,87,93,124,152],"image-based":[40,55],"models.This":[41],"paper":[42],"identifies":[43],"perception":[45,82],"bottleneck":[46,83],"token":[48,107],"overload":[49,108],"key":[51],"challenges":[52],"extending":[54],"VLMs":[56],"to":[57,119],"video":[59,126,137],"domain.To":[60],"address":[61],"these":[62],"issues,":[63],"we":[64],"propose":[65],"D-CoDe,":[66],"a":[67],"training-free":[68],"framework":[70],"incorporates":[72],"dynamic":[73,78],"compression":[74,79],"question":[76,104],"decomposition.Specifically,":[77],"alleviates":[80],"through":[84],"adaptive":[85],"selection":[86],"representative":[88],"frames":[89],"content-aware":[91],"aggregation":[92],"spatial":[94],"tokens,":[95],"thereby":[96],"reducing":[97],"redundancy":[98],"while":[99],"preserving":[100],"informative":[101],"content.In":[102],"parallel,":[103],"decomposition":[105],"mitigates":[106],"reformulating":[110],"original":[112],"query":[113],"into":[114],"sub-questions,":[115],"guiding":[116],"model":[118],"focus":[120],"on":[121,144],"distinct":[122],"aspects":[123],"enabling":[128],"more":[129],"comprehensive":[130],"understanding.Experiments":[131],"demonstrate":[132],"D-CoDe":[134,153],"improves":[136],"understanding":[138],"across":[139],"various":[140],"benchmarks.Furthermore,":[141],"strong":[142],"performance":[143],"challenging":[146],"long-video":[147],"benchmark":[148],"highlights":[149],"potential":[151],"handling":[155],"complex":[156],"tasks.":[158]},"counts_by_year":[],"updated_date":"2026-03-13T14:20:09.374765","created_date":"2025-11-08T00:00:00"}
