{"id":"https://openalex.org/W7162645559","doi":"https://doi.org/10.48550/arxiv.2605.28741","title":"Self-Prophetic Decoding to Unlock Visual Search in LVLMs","display_name":"Self-Prophetic Decoding to Unlock Visual Search in LVLMs","publication_year":2026,"publication_date":"2026-05-27","ids":{"openalex":"https://openalex.org/W7162645559","doi":"https://doi.org/10.48550/arxiv.2605.28741"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.28741","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28741","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.28741","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137297857","display_name":"Zhendong He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Zhendong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101339724","display_name":"Qiyuan Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Qiyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137236031","display_name":"Guanbin Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Guanbin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137273682","display_name":"Liang Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Liang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137224587","display_name":"Sibei Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Sibei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.944100022315979,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.944100022315979,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.01360000018030405,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.009600000455975533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7652000188827515},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.70660001039505},{"id":"https://openalex.org/keywords/visual-search","display_name":"Visual search","score":0.6615999937057495},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.555899977684021},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.4758000075817108},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4228000044822693},{"id":"https://openalex.org/keywords/interference","display_name":"Interference (communication)","score":0.39070001244544983}],"concepts":[{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7652000188827515},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7350000143051147},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.70660001039505},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.6615999937057495},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.555899977684021},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.491100013256073},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.4758000075817108},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4228000044822693},{"id":"https://openalex.org/C32022120","wikidata":"https://www.wikidata.org/wiki/Q797225","display_name":"Interference (communication)","level":3,"score":0.39070001244544983},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.37119999527931213},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.36149999499320984},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3384000062942505},{"id":"https://openalex.org/C125583679","wikidata":"https://www.wikidata.org/wiki/Q755673","display_name":"Search algorithm","level":2,"score":0.3221000134944916},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.31679999828338623},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2994000017642975},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.29249998927116394},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.27230000495910645},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2644999921321869},{"id":"https://openalex.org/C19889080","wikidata":"https://www.wikidata.org/wiki/Q2835852","display_name":"Beam search","level":3,"score":0.25369998812675476}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.28741","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28741","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.28741","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28741","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Vision-Language":[1],"Models":[2],"(LVLMs)":[3],"are":[4],"rapidly":[5],"evolving":[6],"toward":[7],"true":[8],"multimodal":[9],"reasoning,":[10],"with":[11],"visual":[12,24,150],"search":[13,25,151],"representing":[14],"a":[15,82,91,116,131],"concrete":[16],"instantiation":[17],"of":[18,63,148],"the":[19,59,64,86,94],"thinking-with-images":[20],"paradigm.":[21],"However,":[22],"LVLM":[23],"faces":[26],"two":[27,48],"key":[28],"challenges:":[29],"incompatibility":[30],"among":[31],"intrinsic":[32,60,122],"capabilities":[33,62,124],"after":[34],"post-training,":[35],"and":[36,55,71,93],"interference":[37],"in":[38,130],"long":[39],"multi-step":[40,107,128],"reasoning":[41,129],"contexts.":[42],"To":[43],"address":[44],"these,":[45],"we":[46,113],"identify":[47],"novel":[49],"insights.":[50],"First,":[51],"self-regulation":[52],"between":[53],"pre-":[54],"post-training":[56,95],"LVLMs":[57,143],"leverages":[58,121],"single-step":[61,123],"pre-training":[65,87],"model":[66,88,96],"to":[67,125,165],"mitigate":[68],"capability":[69],"deterioration":[70],"long-context":[72],"interference.":[73],"Second,":[74],"probability-based":[75],"prophetic":[76,99,168],"sampling,":[77],"replacing":[78],"naive":[79],"prompting,":[80],"provides":[81],"probabilistic":[83],"interface":[84],"where":[85],"acts":[89],"as":[90,153,155],"prophet":[92],"selectively":[97],"accepts":[98],"tokens":[100],"under":[101],"its":[102,166],"output":[103],"distribution,":[104],"preserving":[105],"coherent":[106,127],"reasoning.":[108],"Building":[109],"on":[110],"these":[111],"insights,":[112],"introduce":[114],"SeProD,":[115],"self-prophetic":[117],"decoding":[118],"framework":[119],"that":[120,137],"enable":[126],"training-free,":[132],"plug-and-play":[133],"manner.":[134],"Experiments":[135],"show":[136],"SeProD":[138],"consistently":[139],"improves":[140],"multiple":[141],"visual-search":[142],"across":[144,156],"all":[145],"12":[146],"splits":[147],"4":[149],"benchmarks,":[152,159],"well":[154],"general":[157],"VQA":[158],"without":[160],"added":[161],"computational":[162],"overhead,":[163],"thanks":[164],"parallel":[167],"acceptance":[169],"mechanism.":[170]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-29T00:00:00"}
