{"id":"https://openalex.org/W4416035694","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.1230","title":"Socratic-MCTS: Test-Time Visual Reasoning by Asking the Right Questions","display_name":"Socratic-MCTS: Test-Time Visual Reasoning by Asking the Right Questions","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416035694","doi":"https://doi.org/10.18653/v1/2025.emnlp-main.1230"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.emnlp-main.1230","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.1230","pdf_url":"https://aclanthology.org/2025.emnlp-main.1230.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.emnlp-main.1230.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030331254","display_name":"David Acuna","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"David Acuna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102930940","display_name":"Ximing Lu","orcid":"https://orcid.org/0000-0001-6671-4573"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ximing Lu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051938345","display_name":"Jaehun Jung","orcid":"https://orcid.org/0000-0002-4856-3668"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jaehun Jung","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100330223","display_name":"Hyunwoo Kim","orcid":"https://orcid.org/0000-0003-4810-6333"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hyunwoo Kim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078831654","display_name":"Amlan Kar","orcid":"https://orcid.org/0000-0003-4540-6007"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Amlan Kar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070642269","display_name":"Sanja Fidler","orcid":"https://orcid.org/0000-0003-1040-3260"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sanja Fidler","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5109705925","display_name":"Yejin Choi","orcid":"https://orcid.org/0009-0003-2495-6269"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yejin Choi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5030331254"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18361389,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"24158","last_page":"24171"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.2071000039577484,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.2071000039577484,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.08290000259876251,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.052000001072883606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/eye-tracking","display_name":"Eye tracking","score":0.2994999885559082},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.2775999903678894},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.25870001316070557},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.2533000111579895}],"concepts":[{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.4772000014781952},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.38370001316070557},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.37279999256134033},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34470000863075256},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.26440000534057617},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.25209999084472656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2025.emnlp-main.1230","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.1230","pdf_url":"https://aclanthology.org/2025.emnlp-main.1230.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.emnlp-main.1230","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.emnlp-main.1230","pdf_url":"https://aclanthology.org/2025.emnlp-main.1230.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416035694.pdf","grobid_xml":"https://content.openalex.org/works/W4416035694.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"research":[1],"in":[2,23,123,150,162,193,256],"vision-language":[3],"models":[4,34],"(VLMs)":[5],"has":[6],"centered":[7],"around":[8],"the":[9,20,32,40,88,110,113,156,159,163,167,172,177,182,187,190,194,198,217,221,245,248,257,269,275],"possibility":[10,74],"of":[11,158,189,220],"equipping":[12],"them":[13],"with":[14,208,237],"implicit":[15],"long-form":[16],"chainof-thought":[17],"reasoning-akin":[18],"to":[19,229],"success":[21],"observed":[22],"language":[24],"models-via":[25],"distillation":[26],"and":[27,37,59,118,132],"reinforcement":[28],"learning.But":[29],"what":[30,227],"about":[31],"non-reasoning":[33],"already":[35],"trained":[36],"deployed":[38],"across":[39,129],"internet?Should":[41],"we":[42,71],"simply":[43],"abandon":[44],"them,":[45],"or":[46,67,247],"is":[47,155,214,224,235,242,261,265,272],"there":[48],"hope":[49],"for":[50],"a":[51,76,97,106,139,146,204,231,238],"search":[52,98],"mechanism":[53],"that":[54,93],"can":[55],"elicit":[56],"hidden":[57],"knowledge":[58,117],"induce":[60],"long":[61],"reasoning":[62,95,121],"traces-without":[63],"any":[64],"additional":[65],"training":[66],"supervision?In":[68],"this":[69,73],"paper,":[70],"explore":[72],"using":[75],"Monte":[77],"Carlo":[78],"Tree":[79],"Search":[80],"(MCTS)-inspired":[81],"algorithm,":[82],"which":[83,234],"injects":[84],"subquestion-subanswer":[85],"pairs":[86],"into":[87],"model's":[89],"output":[90],"stream.We":[91],"show":[92],"framing":[94],"as":[96,102,250],"process-where":[99],"subquestions":[100],"act":[101],"latent":[103],"decisions":[104],"within":[105],"broader":[107],"inference":[108],"trajectory-helps":[109],"model":[111],"\"connect":[112],"dots\"":[114],"between":[115],"fragmented":[116],"produce":[119],"extended":[120],"traces":[122],"nonreasoning":[124],"models.We":[125],"evaluate":[126],"our":[127,136],"method":[128],"three":[130],"benchmarks":[131],"observe":[133],"consistent":[134,236],"improvements.Notably,":[135],"approach":[137],"yields":[138],"2%":[140],"overall":[141],"improvement":[142],"on":[143,166,171,176,181,216,226,244,274],"MMMU-PRO,":[144],"including":[145],"significant":[147],"9%":[148],"gain":[149],"Liberal":[151],"Arts.Problem":[152],"(P)":[153],"What":[154],"position":[157,188],"blue":[160,191,212,270],"car":[161,192,213,241,260,271],"image?A:":[164],"parked":[165,175,180,273],"sidewalk":[168],"B:":[169],"driving":[170],"road":[173,183],"C:":[174],"grass":[178,246],"D:":[179],"CoT:":[184],"To":[185],"determine":[186],"image,":[195],"let's":[196],"analyze":[197],"scene:":[199],"*":[200],"The":[201,211,240,259],"image":[202],"shows":[203],"busy":[205],"urban":[206],"street":[207],"several":[209],"vehicles.*":[210],"located":[215],"right":[218],"side":[219],"image.*":[222,258],"It":[223],"positioned":[225],"appears":[228],"be":[230],"paved":[232],"surface,":[233],"road.*":[239],"not":[243],"sidewalk,":[249],"those":[251],"areas":[252],"are":[253],"clearly":[254],"distinguishable":[255],"stationary,":[262],"suggesting":[263],"it":[264],"parked.Given":[266],"these":[267],"observations,":[268],"road.":[276]},"counts_by_year":[],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-11-08T00:00:00"}
