{"id":"https://openalex.org/W7135230757","doi":"https://doi.org/10.48550/arxiv.2603.12166","title":"LatentGeo: Learnable Auxiliary Constructions in Latent Space for Multimodal Geometric Reasoning","display_name":"LatentGeo: Learnable Auxiliary Constructions in Latent Space for Multimodal Geometric Reasoning","publication_year":2026,"publication_date":"2026-03-12","ids":{"openalex":"https://openalex.org/W7135230757","doi":"https://doi.org/10.48550/arxiv.2603.12166"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.12166","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12166","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.12166","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129039263","display_name":"Haiying Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Haiying","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128999577","display_name":"Zihan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zihan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100824896","display_name":"Song Dai","orcid":"https://orcid.org/0009-0003-5413-7635"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Song","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084540711","display_name":"Zhengxuan Zhang","orcid":"https://orcid.org/0000-0002-3370-1976"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhengxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128978488","display_name":"Kairan Dou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dou, Kairan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128956858","display_name":"Xuming Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Xuming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6741999983787537,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.6741999983787537,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11596","display_name":"Constraint Satisfaction and Optimization","score":0.038100000470876694,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.02930000051856041,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.49709999561309814},{"id":"https://openalex.org/keywords/geometric-programming","display_name":"Geometric programming","score":0.47600001096725464},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.4569000005722046},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4528999924659729},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.40139999985694885},{"id":"https://openalex.org/keywords/interleaving","display_name":"Interleaving","score":0.399399995803833},{"id":"https://openalex.org/keywords/geometric-transformation","display_name":"Geometric transformation","score":0.3797999918460846},{"id":"https://openalex.org/keywords/geometric-modeling","display_name":"Geometric modeling","score":0.3314000070095062}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5831000208854675},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5217000246047974},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.49709999561309814},{"id":"https://openalex.org/C20729856","wikidata":"https://www.wikidata.org/wiki/Q2078279","display_name":"Geometric programming","level":2,"score":0.47600001096725464},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.4569000005722046},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4528999924659729},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.40139999985694885},{"id":"https://openalex.org/C28034677","wikidata":"https://www.wikidata.org/wiki/Q17092530","display_name":"Interleaving","level":2,"score":0.399399995803833},{"id":"https://openalex.org/C56435381","wikidata":"https://www.wikidata.org/wiki/Q1196371","display_name":"Geometric transformation","level":3,"score":0.3797999918460846},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33889999985694885},{"id":"https://openalex.org/C104065381","wikidata":"https://www.wikidata.org/wiki/Q1002535","display_name":"Geometric modeling","level":2,"score":0.3314000070095062},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.32089999318122864},{"id":"https://openalex.org/C7305733","wikidata":"https://www.wikidata.org/wiki/Q207961","display_name":"Geometric shape","level":2,"score":0.3147999942302704},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.30970001220703125},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3005000054836273},{"id":"https://openalex.org/C136119220","wikidata":"https://www.wikidata.org/wiki/Q1000660","display_name":"Algebra over a field","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.29170000553131104},{"id":"https://openalex.org/C73301696","wikidata":"https://www.wikidata.org/wiki/Q5469984","display_name":"Formalism (music)","level":3,"score":0.2671000063419342},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.26080000400543213}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.12166","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12166","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.12166","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.12166","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5434033274650574}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"recent":[1],"advances":[2],"in":[3,199],"multimodal":[4,15],"reasoning,":[5,50],"representing":[6],"auxiliary":[7,102,125,185],"geometric":[8,45,53,74,103,179],"constructions":[9,21,104],"remains":[10],"a":[11,92,113,131,156],"fundamental":[12],"challenge":[13],"for":[14],"large":[16],"language":[17],"models":[18],"(MLLMs).":[19],"Such":[20],"are":[22],"absent":[23],"from":[24],"the":[25,194],"original":[26],"diagram":[27],"and":[28,51,72,119,164,169,189],"must":[29],"be":[30],"introduced":[31],"before":[32],"theorems":[33],"apply.":[34],"Existing":[35],"approaches":[36],"predominantly":[37],"rely":[38,77],"on":[39,78,167,178],"explicit":[40],"construction":[41],"paradigms,":[42],"including":[43],"text-based":[44],"specification,":[46],"visual-token":[47],"interleaving":[48],"during":[49,140],"tool-augmented":[52],"execution.":[54],"However,":[55],"these":[56,87,121],"methods":[57],"either":[58],"fail":[59],"to":[60,100],"faithfully":[61],"represent":[62],"complex":[63],"spatial":[64],"relationships,":[65],"incur":[66],"representation":[67,151],"mismatch":[68],"between":[69],"discrete":[70],"symbols":[71],"continuous":[73,96],"structures,":[75],"or":[76,108],"external":[79,109],"capabilities":[80],"that":[81,94,116,136,173],"hinder":[82],"end-to-end":[83],"optimization.":[84],"To":[85,147],"address":[86],"limitations,":[88],"we":[89,153],"propose":[90],"LatentGeo,":[91],"framework":[93],"learns":[95],"latent":[97,122,138],"visual":[98,126],"representations":[99,123,139],"internalize":[101],"without":[105],"pixel-level":[106],"rendering":[107],"executors.":[110],"We":[111],"design":[112],"three-stage":[114],"curriculum":[115],"progressively":[117],"aligns":[118],"internalizes":[120],"through":[124],"supervision,":[127],"followed":[128],"by":[129],"LaGDPO,":[130],"latent-aware":[132],"reinforcement":[133],"learning":[134],"procedure":[135],"stabilizes":[137],"policy":[141],"optimization":[142],"while":[143],"improving":[144],"end-task":[145],"correctness.":[146],"systematically":[148],"evaluate":[149],"construction-centric":[150],"quality,":[152],"introduce":[154],"GeoAux,":[155],"new":[157],"benchmark":[158],"targeting":[159],"visually":[160],"dependent":[161],"geometry":[162],"problems,":[163],"conduct":[165],"experiments":[166],"GeoAux":[168],"MathVerse.":[170],"Results":[171],"show":[172],"LatentGeo":[174],"achieves":[175],"substantial":[176],"gains":[177],"reasoning":[180],"tasks,":[181],"particularly":[182],"those":[183],"requiring":[184],"constructions.":[186],"Extensive":[187],"analyses":[188],"ablation":[190],"studies":[191],"further":[192],"validate":[193],"effectiveness":[195],"of":[196],"each":[197],"component":[198],"our":[200],"framework.":[201]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-14T00:00:00"}
