{"id":"https://openalex.org/W7133362809","doi":"https://doi.org/10.48550/arxiv.2603.00165","title":"ConFoThinking: Consolidated Focused Attention Driven Thinking for Visual Question Answering","display_name":"ConFoThinking: Consolidated Focused Attention Driven Thinking for Visual Question Answering","publication_year":2026,"publication_date":"2026-02-26","ids":{"openalex":"https://openalex.org/W7133362809","doi":"https://doi.org/10.48550/arxiv.2603.00165"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00165","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00165","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00165","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5077047101","display_name":"Zhaodong Wu","orcid":"https://orcid.org/0000-0002-1537-0659"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wu, Zhaodong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127885202","display_name":"Haochen Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Haochen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127874861","display_name":"Qi Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Qi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127898331","display_name":"Wenqi Mo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mo, Wenqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101089172","display_name":"Yu Pei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pei, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100572102","display_name":"Wenqi Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Wenqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127906165","display_name":"Jionglong Su","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Jionglong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128010275","display_name":"Yang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5077047101"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.972000002861023,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.972000002861023,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.005900000222027302,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.003700000001117587,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.7218999862670898},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.618399977684021},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5400999784469604},{"id":"https://openalex.org/keywords/zoom","display_name":"Zoom","score":0.5056999921798706},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4943000078201294},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4075999855995178},{"id":"https://openalex.org/keywords/aggregate","display_name":"Aggregate (composite)","score":0.4034000039100647},{"id":"https://openalex.org/keywords/visual-attention","display_name":"Visual attention","score":0.3785000145435333},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.334199994802475}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.7218999862670898},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.618399977684021},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6018000245094299},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5400999784469604},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5177000164985657},{"id":"https://openalex.org/C124913957","wikidata":"https://www.wikidata.org/wiki/Q1232548","display_name":"Zoom","level":3,"score":0.5056999921798706},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4943000078201294},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.4431999921798706},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4075999855995178},{"id":"https://openalex.org/C4679612","wikidata":"https://www.wikidata.org/wiki/Q866298","display_name":"Aggregate (composite)","level":2,"score":0.4034000039100647},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38370001316070557},{"id":"https://openalex.org/C2986089797","wikidata":"https://www.wikidata.org/wiki/Q6501338","display_name":"Visual attention","level":3,"score":0.3785000145435333},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.334199994802475},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3255999982357025},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.3066999912261963},{"id":"https://openalex.org/C197914299","wikidata":"https://www.wikidata.org/wiki/Q18650","display_name":"Semantic memory","level":3,"score":0.29420000314712524},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.29030001163482666},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.2888000011444092},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.28760001063346863},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.27399998903274536},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.2630000114440918},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2628999948501587},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.26109999418258667},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C93173128","wikidata":"https://www.wikidata.org/wiki/Q287827","display_name":"Inattentional blindness","level":3,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00165","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00165","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00165","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00165","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Thinking":[0,100],"with":[1],"Images":[2],"improves":[3,158],"fine-grained":[4],"VQA":[5,153],"for":[6,24,121],"MLLMs":[7,69],"by":[8,43,93,144],"emphasizing":[9],"visual":[10,123],"cues.":[11],"However,":[12],"tool-augmented":[13],"methods":[14,29],"depend":[15],"on":[16,58],"the":[17,32,73,140],"capacity":[18],"of":[19,34,133],"grounding,":[20],"which":[21,113,138],"remains":[22],"unreliable":[23],"MLLMs.":[25],"In":[26],"parallel,":[27],"attention-driven":[28],"to":[30,52,72,104,135],"crop":[31],"Region":[33],"Interest":[35],"(ROIs)":[36],"are":[37,41],"proposed":[38],"but":[39],"they":[40],"constrained":[42],"(1)":[44],"fragmented":[45,84],"attention":[46,62,81,88,106,128,148],"signals":[47],"scattered":[48],"across":[49,85,151],"layers,":[50,86],"leading":[51],"suboptimal":[53],"localization":[54],"and":[55,87,116,164],"(2)":[56],"relying":[57],"question-":[59,145],"or":[60,146],"redundant-text-conditioned":[61],"extraction.":[63,149],"Our":[64],"analysis":[65],"reveals":[66],"three":[67],"patterns:":[68],"may":[70],"attend":[71],"correct":[74],"region":[75],"yet":[76],"generate":[77],"incorrect":[78],"coordinates,":[79],"where-to-look":[80],"is":[82,90],"often":[83],"extraction":[89],"query-sensitive.":[91],"Motivated":[92],"these,":[94],"We":[95],"propose":[96],"ConFoThinking,":[97],"a":[98,108],"Consolidated-Focused-Attention-Driven":[99],"framework":[101],"that":[102],"learns":[103],"aggregate":[105],"into":[107],"designated":[109],"intermediate":[110],"layer,":[111],"from":[112],"we":[114,126],"mine":[115],"zoom":[117],"in":[118],"salient":[119],"regions":[120],"downstream":[122],"understanding.":[124],"Moreover,":[125],"extract":[127],"using":[129],"concise":[130],"semantic":[131,141],"cues":[132],"what":[134],"look":[136],"into,":[137],"mitigates":[139],"noise":[142],"introduced":[143],"redundant-text-based":[147],"Experiments":[150],"five":[152],"benchmarks":[154],"demonstrate":[155],"ConFoThinking":[156],"significantly":[157],"perception":[159],"performance.":[160],"The":[161],"code,":[162],"checkpoints,":[163],"dataset":[165],"will":[166],"be":[167],"released":[168],"after":[169],"being":[170],"accepted.":[171]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
