{"id":"https://openalex.org/W7128679511","doi":"https://doi.org/10.48550/arxiv.2602.11073","title":"Chatting with Images for Introspective Visual Thinking","display_name":"Chatting with Images for Introspective Visual Thinking","publication_year":2026,"publication_date":"2026-02-11","ids":{"openalex":"https://openalex.org/W7128679511","doi":"https://doi.org/10.48550/arxiv.2602.11073"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.11073","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125718128","display_name":"Junfei Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wu, Junfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125712116","display_name":"Jian Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Jian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125756153","display_name":"Qiang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Qiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125743469","display_name":"Shu 
Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Shu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125693201","display_name":"Liang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Liang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125726335","display_name":"Wei Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125760611","display_name":"Tieniu Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Tieniu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5125718128"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning 
Applications","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.0008999999845400453,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.000699999975040555,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.722100019454956},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5397999882698059},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4684000015258789},{"id":"https://openalex.org/keywords/visual-learning","display_name":"Visual learning","score":0.448199987411499},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.42910000681877136},{"id":"https://openalex.org/keywords/language-understanding","display_name":"Language 
understanding","score":0.37700000405311584},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.36169999837875366},{"id":"https://openalex.org/keywords/visual-language","display_name":"Visual language","score":0.3564999997615814}],"concepts":[{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.722100019454956},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.699400007724762},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5949000120162964},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5397999882698059},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4684000015258789},{"id":"https://openalex.org/C2779321571","wikidata":"https://www.wikidata.org/wiki/Q7936605","display_name":"Visual learning","level":2,"score":0.448199987411499},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.42910000681877136},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38040000200271606},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language 
understanding","level":2,"score":0.37700000405311584},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.36169999837875366},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.3564999997615814},{"id":"https://openalex.org/C129671850","wikidata":"https://www.wikidata.org/wiki/Q210501","display_name":"Introspection","level":2,"score":0.3443000018596649},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33730000257492065},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3109000027179718},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.2971999943256378},{"id":"https://openalex.org/C193611912","wikidata":"https://www.wikidata.org/wiki/Q4677596","display_name":"Active vision","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.27799999713897705},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.2777999937534332},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action 
(physics)","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C200220432","wikidata":"https://www.wikidata.org/wiki/Q7936208","display_name":"Vision science","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26460000872612},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.25920000672340393},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.25049999356269836}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.11073","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.11073","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.11073","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell 
University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.11073","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.7552627921104431,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"large":[1],"vision-language":[2],"models":[3],"(LVLMs)":[4],"typically":[5],"rely":[6],"on":[7,11,183],"text-only":[8],"reasoning":[9,121,164,189],"based":[10],"a":[12,88,132,137,152],"single-pass":[13],"visual":[14,23,48,64,93,123,146],"encoding,":[15],"which":[16],"often":[17,51],"leads":[18],"to":[19,33,161],"loss":[20],"of":[21,28,102],"fine-grained":[22],"information.":[24],"Recently":[25],"the":[26,46,100,106],"proposal":[27],"''thinking":[29],"with":[30,86,136,151,179],"images''":[31],"attempts":[32],"alleviate":[34],"this":[35,128],"limitation":[36],"by":[37],"manipulating":[38],"images":[39],"via":[40],"external":[41],"tools":[42],"or":[43,66,76],"code;":[44],"however,":[45],"resulting":[47],"states":[49],"are":[50],"insufficiently":[52],"grounded":[53],"in":[54,130],"linguistic":[55,120],"semantics,":[56],"impairing"
:[57],"effective":[58,163],"cross-modal":[59],"alignment":[60],"-":[61],"particularly":[62,180],"when":[63],"semantics":[65],"geometric":[67],"relationships":[68],"must":[69],"be":[70],"reasoned":[71],"over":[72,112],"across":[73,168],"distant":[74],"regions":[75],"multiple":[77,113],"images.":[78],"To":[79],"address":[80],"these":[81],"challenges,":[82],"we":[83],"propose":[84],"''chatting":[85],"images'',":[87],"new":[89],"framework":[90],"that":[91,172],"reframes":[92],"manipulation":[94],"as":[95],"language-guided":[96],"feature":[97],"modulation.":[98],"Under":[99],"guidance":[101],"expressive":[103],"language":[104],"prompts,":[105],"model":[107],"dynamically":[108],"performs":[109],"joint":[110],"re-encoding":[111],"image":[114],"regions,":[115],"enabling":[116],"tighter":[117],"coupling":[118],"between":[119],"and":[122,148,158,176,186],"state":[124],"updates.":[125],"We":[126],"instantiate":[127],"paradigm":[129],"ViLaVT,":[131],"novel":[133],"LVLM":[134],"equipped":[135],"dynamic":[138],"vision":[139],"encoder":[140],"explicitly":[141],"designed":[142],"for":[143],"such":[144],"interactive":[145],"reasoning,":[147],"trained":[149],"it":[150],"two-stage":[153],"curriculum":[154],"combining":[155],"supervised":[156],"fine-tuning":[157],"reinforcement":[159],"learning":[160],"promote":[162],"behaviors.":[165],"Extensive":[166],"experiments":[167],"eight":[169],"benchmarks":[170],"demonstrate":[171],"ViLaVT":[173],"achieves":[174],"strong":[175],"consistent":[177],"improvements,":[178],"pronounced":[181],"gains":[182],"complex":[184],"multi-image":[185],"video-based":[187],"spatial":[188],"tasks.":[190]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-13T00:00:00"}
