{"id":"https://openalex.org/W7162556301","doi":"https://doi.org/10.48550/arxiv.2605.26680","title":"DynFrame: Adaptive Reasoning-Driven Multimodal Framework with Dynamic Frame Augmentation for Complex Video Understanding","display_name":"DynFrame: Adaptive Reasoning-Driven Multimodal Framework with Dynamic Frame Augmentation for Complex Video Understanding","publication_year":2026,"publication_date":"2026-05-26","ids":{"openalex":"https://openalex.org/W7162556301","doi":"https://doi.org/10.48550/arxiv.2605.26680"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.26680","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26680","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.26680","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137182350","display_name":"Peng Zhang","orcid":"https://orcid.org/0009-0009-1903-1417"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Peng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137092481","display_name":"Guanghao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Guanghao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137116185","display_name":"Wanggui He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Wanggui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137085241","display_name":"Longxiang Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Longxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039853684","display_name":"Mushui Liu","orcid":"https://orcid.org/0000-0002-2909-7702"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Mushui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137139599","display_name":"Yan Xia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xia, Yan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137091667","display_name":"Zhenhao Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Zhenhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102266633","display_name":"Weilong Dai","orcid":"https://orcid.org/0009-0006-9288-932X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Weilong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137105988","display_name":"Jinlong Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jinlong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137124583","display_name":"Haobing Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Haobing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137169024","display_name":"Le Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Le","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137127105","display_name":"Hao Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5059615376","display_name":"Pipei Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Pipei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9832000136375427,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9832000136375427,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0031999999191612005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0024999999441206455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6237999796867371},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.5339999794960022},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4884999990463257},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.48089998960494995},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.4399000108242035},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.4124999940395355},{"id":"https://openalex.org/keywords/window","display_name":"Window (computing)","score":0.3865000009536743},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3862999975681305}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8116999864578247},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6237999796867371},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.5339999794960022},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5264999866485596},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4884999990463257},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.48089998960494995},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.4399000108242035},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.4124999940395355},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.3865000009536743},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3862999975681305},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3725000023841858},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.34130001068115234},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3280999958515167},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.3253999948501587},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.32109999656677246},{"id":"https://openalex.org/C159877910","wikidata":"https://www.wikidata.org/wiki/Q2202883","display_name":"Autoregressive model","level":2,"score":0.3179999887943268},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.30970001220703125},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.26440000534057617},{"id":"https://openalex.org/C3261483","wikidata":"https://www.wikidata.org/wiki/Q119565","display_name":"Frame rate","level":2,"score":0.2578999996185303}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.26680","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26680","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.26680","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26680","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.49176573753356934,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"video":[1,21],"multimodal":[2],"large":[3],"language":[4],"models":[5,17],"(MLLMs)":[6],"increasingly":[7],"couple":[8],"step-by-step":[9],"reasoning":[10],"with":[11,88,153,202],"on-demand":[12],"visual":[13],"evidence":[14,64,152],"retrieval,":[15],"allowing":[16],"to":[18,50,96,102],"revisit":[19],"relevant":[20],"segments":[22],"during":[23],"inference.":[24],"However,":[25],"two":[26],"structural":[27],"gaps":[28],"remain":[29],"in":[30],"existing":[31,42],"thinking-with-video":[32],"systems.":[33],"(i)":[34],"Sampling":[35],"density":[36,136],"is":[37,57,65,112,117,200,224],"not":[38],"a":[39,61,89,126,141,154],"learnable":[40,146],"decision:":[41],"methods":[43],"may":[44],"let":[45],"the":[46,53,94,100,106,115,130,134,160,176,186,190,194],"model":[47],"decide":[48],"where":[49],"look,":[51],"but":[52],"per-window":[54],"frame":[55],"rate":[56],"largely":[58],"fixed.":[59],"As":[60],"result,":[62],"fine-grained":[63],"often":[66],"recovered":[67],"through":[68],"repeated":[69],"retrieval":[70,148,156,163,177],"calls,":[71],"which":[72,171],"increases":[73],"inference":[74],"context":[75],"length":[76],"and":[77,82,99,114,133,179,189,197,215],"training":[78],"difficulty.":[79],"(ii)":[80],"Retrieval":[81],"answer":[83],"generation":[84],"are":[85],"usually":[86],"optimized":[87],"single":[90,142,155],"trajectory-level":[91],"advantage,":[92],"so":[93],"\"where":[95],"look\"":[97],"tokens":[98,104,139],"\"how":[101],"answer\"":[103],"receive":[105],"same":[107],"credit":[108],"even":[109],"when":[110],"one":[111],"correct":[113],"other":[116],"not.":[118],"To":[119],"address":[120],"these":[121],"gaps,":[122],"we":[123,165],"present":[124],"DynFrame,":[125],"framework":[127],"that":[128],"emits":[129],"temporal":[131],"window":[132],"sampling":[135,187],"as":[137],"native":[138],"within":[140],"autoregressive":[143],"pass.":[144],"This":[145],"span-density":[147],"enables":[149],"acquiring":[150],"multi-granularity":[151],"step.":[157],"Based":[158],"on":[159,193,220],"above":[161],"tokenized":[162],"interface,":[164],"further":[166],"introduce":[167],"Segment-Decoupled":[168],"GRPO":[169],"(SD-GRPO),":[170],"splits":[172],"each":[173],"rollout":[174],"at":[175,226],"boundary":[178],"assigns":[180],"role-specific":[181],"token-level":[182],"advantages,":[183],"separately":[184],"crediting":[185],"decision":[188],"answer.":[191],"Trained":[192],"curated":[195],"DM-CoT-74k":[196],"DM-RL-45k,":[198],"DynFrame-4B":[199],"competitive":[201],"strong":[203],"7B-8B":[204],"baselines":[205],"across":[206],"six":[207],"benchmarks":[208],"(NExT-GQA,":[209],"Charades-STA,":[210],"ActivityNet-MR,":[211],"Video-MME,":[212],"MLVU,":[213],"LVBench),":[214],"DynFrame-8B":[216],"sets":[217],"new":[218],"state-of-the-art":[219],"most":[221],"metrics.":[222],"Code":[223],"available":[225],"https://github.com/zhangguanghao523/DynFrame.":[227]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-28T00:00:00"}
