{"id":"https://openalex.org/W7140168601","doi":"https://doi.org/10.48550/arxiv.2603.20808","title":"Predictive Regularization Against Visual Representation Degradation in Multimodal Large Language Models","display_name":"Predictive Regularization Against Visual Representation Degradation in Multimodal Large Language Models","publication_year":2026,"publication_date":"2026-03-21","ids":{"openalex":"https://openalex.org/W7140168601","doi":"https://doi.org/10.48550/arxiv.2603.20808"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.20808","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20808","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.20808","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wang, Enguang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Enguang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Qiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wu, Yuanchen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yuanchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yan, Ke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Ke","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yuan, Xinbin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Xinbin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ding, Shouhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Shouhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Xialei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Xialei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Cheng, Ming-Ming","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Ming-Ming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9805999994277954,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9805999994277954,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0017000000225380063,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11148","display_name":"Language, Metaphor, and Cognition","score":0.0013000000035390258,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.6262999773025513},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5041000247001648},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.46860000491142273},{"id":"https://openalex.org/keywords/fidelity","display_name":"Fidelity","score":0.4440000057220459},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.3977999985218048},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.3197999894618988},{"id":"https://openalex.org/keywords/degradation","display_name":"Degradation (telecommunications)","score":0.29159998893737793}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6894000172615051},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.6262999773025513},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6025999784469604},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5041000247001648},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.46860000491142273},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.4440000057220459},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3977999985218048},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37279999256134033},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3197999894618988},{"id":"https://openalex.org/C2779679103","wikidata":"https://www.wikidata.org/wiki/Q5251805","display_name":"Degradation (telecommunications)","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2913999855518341},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2856999933719635},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2840999960899353},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2768000066280365},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.27399998903274536},{"id":"https://openalex.org/C164280684","wikidata":"https://www.wikidata.org/wiki/Q5529040","display_name":"Gaze-contingency paradigm","level":4,"score":0.2720000147819519},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.20808","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20808","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.20808","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20808","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6624495387077332}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"Multimodal":[1],"Large":[2],"Language":[3],"Models":[4],"(MLLMs)":[5],"excel":[6],"at":[7],"vision-language":[8,148],"tasks,":[9],"the":[10,48,52,56,81,86,129,134,151],"cost":[11],"of":[12,59,133,154],"their":[13],"language-driven":[14],"training":[15],"on":[16],"internal":[17,136,157],"visual":[18,37,50,53,77,90,110,125,131,144,158],"foundational":[19],"competence":[20],"remains":[21],"unclear.":[22],"In":[23],"this":[24,73,143],"paper,":[25],"we":[26,43],"conduct":[27],"a":[28,34,63,76,100],"detailed":[29],"diagnostic":[30],"analysis":[31],"to":[32,47,75,92,117,122],"unveil":[33],"pervasive":[35],"issue:":[36],"representation":[38,54],"degradation":[39,64,145],"in":[40,55,65],"MLLMs.":[41],"Specifically,":[42],"find":[44],"that":[45,99,141],"compared":[46],"initial":[49,124],"features,":[51,126],"middle":[57],"layers":[58],"LLM":[60],"exhibits":[61],"both":[62,104],"global":[66],"function":[67],"and":[68,108,112],"patch":[69],"structure.":[70],"We":[71,97],"attribute":[72],"phenomenon":[74],"sacrifice":[78],"driven":[79],"by":[80],"singular":[82],"text-generation":[83],"objective,":[84],"where":[85],"model":[87],"compromises":[88],"its":[89],"fidelity":[91],"optimize":[93],"for":[94,162],"answer":[95],"generation.":[96],"argue":[98],"robust":[101,156],"MLLM":[102],"requires":[103],"strong":[105],"cross-modal":[106],"reasoning":[107],"core":[109],"competence,":[111],"propose":[113],"Predictive":[114],"Regularization":[115],"(PRe)":[116],"force":[118],"degraded":[119],"intermediate":[120],"features":[121],"predict":[123],"thereby":[127],"maintaining":[128],"inherent":[130],"attributes":[132],"MLLM's":[135],"representations.":[137],"Extensive":[138],"experiments":[139],"confirm":[140],"mitigating":[142],"effectively":[146],"boosts":[147],"performance,":[149],"underscoring":[150],"critical":[152],"importance":[153],"fostering":[155],"representations":[159],"within":[160],"MLLMs":[161],"comprehensive":[163],"multimodal":[164],"understanding.":[165]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
