{"id":"https://openalex.org/W7160319041","doi":"https://doi.org/10.48550/arxiv.2605.02735","title":"Visual Latents Know More Than They Say: Unsilencing Latent Reasoning in MLLMs","display_name":"Visual Latents Know More Than They Say: Unsilencing Latent Reasoning in MLLMs","publication_year":2026,"publication_date":"2026-05-04","ids":{"openalex":"https://openalex.org/W7160319041","doi":"https://doi.org/10.48550/arxiv.2605.02735"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.02735","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02735","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.02735","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135409142","display_name":"Xin Zhang","orcid":"https://orcid.org/0009-0009-7159-3476"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101366839","display_name":"Qiqi Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Qiqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135324772","display_name":"Jiawei Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Jiawei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016553220","display_name":"Moyun Liu","orcid":"https://orcid.org/0000-0002-4530-2606"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Moyun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135309934","display_name":"Joey Tianyi Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Joey Tianyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9865000247955322,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9865000247955322,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.002099999925121665,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0010000000474974513,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.598800003528595},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.5946999788284302},{"id":"https://openalex.org/keywords/probabilistic-latent-semantic-analysis","display_name":"Probabilistic latent semantic analysis","score":0.5616000294685364},{"id":"https://openalex.org/keywords/latent-variable","display_name":"Latent variable","score":0.45410001277923584},{"id":"https://openalex.org/keywords/latent-semantic-analysis","display_name":"Latent semantic analysis","score":0.4424000084400177},{"id":"https://openalex.org/keywords/deductive-reasoning","display_name":"Deductive reasoning","score":0.4392000138759613},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4196000099182129},{"id":"https://openalex.org/keywords/qualitative-reasoning","display_name":"Qualitative reasoning","score":0.35749998688697815}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7179999947547913},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.598800003528595},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.5946999788284302},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5659000277519226},{"id":"https://openalex.org/C112933361","wikidata":"https://www.wikidata.org/wiki/Q2845258","display_name":"Probabilistic latent semantic analysis","level":2,"score":0.5616000294685364},{"id":"https://openalex.org/C51167844","wikidata":"https://www.wikidata.org/wiki/Q4422623","display_name":"Latent variable","level":2,"score":0.45410001277923584},{"id":"https://openalex.org/C170133592","wikidata":"https://www.wikidata.org/wiki/Q1806883","display_name":"Latent semantic analysis","level":2,"score":0.4424000084400177},{"id":"https://openalex.org/C97364631","wikidata":"https://www.wikidata.org/wiki/Q484284","display_name":"Deductive reasoning","level":2,"score":0.4392000138759613},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4196000099182129},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.35749998688697815},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.3492000102996826},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34790000319480896},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.328900009393692},{"id":"https://openalex.org/C65965080","wikidata":"https://www.wikidata.org/wiki/Q1806885","display_name":"Latent variable model","level":3,"score":0.3174999952316284},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C115086926","wikidata":"https://www.wikidata.org/wiki/Q17004651","display_name":"Causal reasoning","level":3,"score":0.314300000667572},{"id":"https://openalex.org/C58024561","wikidata":"https://www.wikidata.org/wiki/Q207721","display_name":"Latent heat","level":2,"score":0.3107999861240387},{"id":"https://openalex.org/C86827895","wikidata":"https://www.wikidata.org/wiki/Q7098582","display_name":"Opportunistic reasoning","level":4,"score":0.30790001153945923},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.2994000017642975},{"id":"https://openalex.org/C103057564","wikidata":"https://www.wikidata.org/wiki/Q4751139","display_name":"Analytic reasoning","level":3,"score":0.28369998931884766},{"id":"https://openalex.org/C107848011","wikidata":"https://www.wikidata.org/wiki/Q4680756","display_name":"Adaptive reasoning","level":4,"score":0.26840001344680786},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2624000012874603},{"id":"https://openalex.org/C2777055276","wikidata":"https://www.wikidata.org/wiki/Q7936580","display_name":"Visual approach","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.25940001010894775},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.2558000087738037}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.02735","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02735","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.02735","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.02735","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Continuous":[0],"latent-space":[1],"reasoning":[2,22,36,79,102,135,162,188],"offers":[3],"a":[4,27,140],"compact":[5],"alternative":[6],"to":[7,17,48,152],"textual":[8],"chain-of-thought":[9],"for":[10],"multimodal":[11],"models,":[12],"enabling":[13],"high-dimensional":[14],"visual":[15,35,39,68,113,191],"evidence":[16],"be":[18],"integrated":[19],"without":[20,180],"explicit":[21],"tokens.":[23],"However,":[24],"we":[25,91],"identify":[26],"previously":[28],"overlooked":[29],"optimization":[30],"pathology":[31],"in":[32],"existing":[33],"latent":[34,71,101,128,134,150,161,178],"methods:":[37],"although":[38],"latents":[40,114],"become":[41,153],"semantically":[42],"enriched":[43],"during":[44],"training,":[45],"their":[46],"contribution":[47],"final":[49],"answer":[50],"prediction":[51],"is":[52,136],"systematically":[53],"suppressed.":[54],"Within":[55],"the":[56,60,93,100,133,149,160,186],"shared":[57],"parameter":[58,182],"space,":[59],"autoregressive":[61],"objective":[62],"favors":[63],"shortcut":[64],"reliance":[65],"on":[66],"direct":[67],"input,":[69],"driving":[70],"tokens":[72],"toward":[73],"transition-like":[74],"states":[75],"rather":[76,163],"than":[77,164],"informative":[78],"content.":[80],"We":[81],"term":[82],"this":[83],"phenomenon":[84],"Silenced":[85],"Visual":[86],"Latents.":[87],"To":[88],"address":[89],"it,":[90],"disentangle":[92],"two":[94],"conflicting":[95],"objectives":[96],"by":[97],"directly":[98],"optimizing":[99],"at":[103],"inference":[104],"time,":[105],"keeping":[106],"backbone":[107],"parameters":[108],"frozen.":[109],"In":[110,130],"Stage":[111,131],"I,":[112],"are":[115],"warmed":[116],"up":[117],"via":[118,139],"query-guided":[119],"contrastive":[120],"latent--visual":[121],"alignment,":[122],"improving":[123],"semantic":[124],"quality":[125],"while":[126],"preventing":[127],"collapse.":[129],"II,":[132],"further":[137],"optimized":[138],"confidence-progression":[141],"reward,":[142],"which":[143],"incentivizes":[144],"predicted":[145],"token":[146],"distributions":[147],"along":[148],"span":[151],"progressively":[154],"more":[155],"concentrated,":[156],"routing":[157],"predictions":[158],"through":[159],"bypassing":[165],"it.":[166],"Experiments":[167],"across":[168],"eight":[169],"benchmarks":[170],"and":[171],"four":[172],"model":[173],"backbones":[174],"show":[175],"that":[176],"inference-time":[177],"optimization,":[179],"any":[181],"updates,":[183],"effectively":[184],"unleashes":[185],"suppressed":[187],"capacity":[189],"of":[190],"latents.":[192]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-06T00:00:00"}
