{"id":"https://openalex.org/W7133561896","doi":"https://doi.org/10.48550/arxiv.2603.02748","title":"iGVLM: Dynamic Instruction-Guided Vision Encoding for Question-Aware Multimodal Understanding","display_name":"iGVLM: Dynamic Instruction-Guided Vision Encoding for Question-Aware Multimodal Understanding","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7133561896","doi":"https://doi.org/10.48550/arxiv.2603.02748"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.02748","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071582484","display_name":"Hanpeng Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Hanpeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128096638","display_name":"Yaqian Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yaqian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128054643","display_name":"Zidan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zidan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005741692","display_name":"Shuoxi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shuoxi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120826306","display_name":"Zihao Bo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bo, Zihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128042636","display_name":"Rinyoichi Takezoe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takezoe, Rinyoichi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128063568","display_name":"Kaiwen Long","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Long, Kaiwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128095372","display_name":"Kun He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Kun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5071582484"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9639000296592712,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9639000296592712,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.005400000140070915,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.004900000058114529,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.5443999767303467},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5299000144004822},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.49880000948905945},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4691999852657318},{"id":"https://openalex.org/keywords/affine-transformation","display_name":"Affine transformation","score":0.45649999380111694},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.41100001335144043},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.4018000066280365},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.3441999852657318}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7006999850273132},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5982000231742859},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.5443999767303467},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5299000144004822},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.49880000948905945},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4691999852657318},{"id":"https://openalex.org/C92757383","wikidata":"https://www.wikidata.org/wiki/Q382497","display_name":"Affine transformation","level":2,"score":0.45649999380111694},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.41100001335144043},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.4018000066280365},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3441999852657318},{"id":"https://openalex.org/C190470478","wikidata":"https://www.wikidata.org/wiki/Q2370229","display_name":"Invariant (physics)","level":2,"score":0.31619998812675476},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3109000027179718},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2856999933719635},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.26019999384880066},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.2587999999523163},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.25859999656677246},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C193611912","wikidata":"https://www.wikidata.org/wiki/Q4677596","display_name":"Active vision","level":2,"score":0.2558000087738037}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.02748","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.02748","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02748","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.02748","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.7448237538337708,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"the":[1,108],"success":[2],"of":[3,113],"Large":[4],"Vision--Language":[5],"Models":[6],"(LVLMs),":[7],"most":[8],"existing":[9],"architectures":[10],"suffer":[11],"from":[12,100],"a":[13,54,63,67,80,97,123,149],"representation":[14,69],"bottleneck:":[15],"they":[16],"rely":[17],"on":[18],"static,":[19],"instruction-agnostic":[20],"vision":[21],"encoders":[22],"whose":[23],"visual":[24,43,59,74,115],"representations":[25,75],"are":[26,45],"utilized":[27],"in":[28],"an":[29],"invariant":[30],"manner":[31],"across":[32,144],"different":[33],"textual":[34],"tasks.":[35],"This":[36,94],"rigidity":[37],"hinders":[38],"fine-grained":[39],"reasoning":[40,105],"where":[41],"task-specific":[42],"cues":[44],"critical.":[46],"To":[47],"address":[48],"this":[49],"issue,":[50],"we":[51,120],"propose":[52],"iGVLM,":[53],"general":[55],"framework":[56],"for":[57,127,152],"instruction-guided":[58],"modulation.":[60],"iGVLM":[61,139],"introduces":[62],"decoupled":[64],"dual-branch":[65],"architecture:":[66],"frozen":[68],"branch":[70,83],"that":[71,84,138],"preserves":[72],"task-agnostic":[73],"learned":[76],"during":[77],"pre-training,":[78],"and":[79,111,156],"dynamic":[81],"conditioning":[82],"performs":[85],"affine":[86],"feature":[87],"modulation":[88],"via":[89],"Adaptive":[90],"Layer":[91],"Normalization":[92],"(AdaLN).":[93],"design":[95],"enables":[96],"smooth":[98],"transition":[99],"general-purpose":[101],"perception":[102,155],"to":[103],"instruction-aware":[104],"while":[106],"maintaining":[107],"structural":[109],"integrity":[110],"stability":[112],"pre-trained":[114],"priors.":[116],"Beyond":[117],"standard":[118],"benchmarks,":[119],"introduce":[121],"MM4,":[122],"controlled":[124],"diagnostic":[125],"probe":[126],"quantifying":[128],"logical":[129],"consistency":[130],"under":[131],"multi-query,":[132],"multi-instruction":[133],"settings.":[134],"Extensive":[135],"results":[136],"show":[137],"consistently":[140],"enhances":[141],"instruction":[142],"sensitivity":[143],"diverse":[145],"language":[146],"backbones,":[147],"offering":[148],"plug-and-play":[150],"paradigm":[151],"bridging":[153],"passive":[154],"active":[157],"reasoning.":[158]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-05T00:00:00"}
