{"id":"https://openalex.org/W7135063411","doi":"https://doi.org/10.48550/arxiv.2603.10485","title":"Dual Space Preconditioning for Gradient Descent in the Overparameterized Regime","display_name":"Dual Space Preconditioning for Gradient Descent in the Overparameterized Regime","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135063411","doi":"https://doi.org/10.48550/arxiv.2603.10485"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10485","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10485","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10485","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093219798","display_name":"Reza Ghane","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghane, Reza","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036629613","display_name":"Danil Akhtiamov","orcid":"https://orcid.org/0000-0002-9238-9636"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Akhtiamov, 
Danil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128846520","display_name":"Babak Hassibi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hassibi, Babak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.964900016784668,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.964900016784668,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12056","display_name":"Markov Chains and Monte Carlo Methods","score":0.011300000362098217,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical 
Sciences"}},{"id":"https://openalex.org/T10500","display_name":"Sparse and Compressive Sensing Techniques","score":0.008899999782443047,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/iterated-function","display_name":"Iterated function","score":0.70169997215271},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.6912000179290771},{"id":"https://openalex.org/keywords/constant","display_name":"Constant (computer programming)","score":0.5005000233650208},{"id":"https://openalex.org/keywords/proximal-gradient-methods","display_name":"Proximal Gradient Methods","score":0.4948999881744385},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.48969998955726624},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.48339998722076416},{"id":"https://openalex.org/keywords/bregman-divergence","display_name":"Bregman divergence","score":0.4659000039100647},{"id":"https://openalex.org/keywords/multiplicative-function","display_name":"Multiplicative function","score":0.461899995803833},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.391400009393692}],"concepts":[{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.7645000219345093},{"id":"https://openalex.org/C140479938","wikidata":"https://www.wikidata.org/wiki/Q5254619","display_name":"Iterated function","level":2,"score":0.70169997215271},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient 
descent","level":3,"score":0.6912000179290771},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.5231000185012817},{"id":"https://openalex.org/C2777027219","wikidata":"https://www.wikidata.org/wiki/Q1284190","display_name":"Constant (computer programming)","level":2,"score":0.5005000233650208},{"id":"https://openalex.org/C10494615","wikidata":"https://www.wikidata.org/wiki/Q17086765","display_name":"Proximal Gradient Methods","level":4,"score":0.4948999881744385},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.48969998955726624},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.48339998722076416},{"id":"https://openalex.org/C149073432","wikidata":"https://www.wikidata.org/wiki/Q4960382","display_name":"Bregman divergence","level":2,"score":0.4659000039100647},{"id":"https://openalex.org/C42747912","wikidata":"https://www.wikidata.org/wiki/Q1048447","display_name":"Multiplicative function","level":2,"score":0.461899995803833},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.40880000591278076},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.391400009393692},{"id":"https://openalex.org/C145446738","wikidata":"https://www.wikidata.org/wiki/Q319913","display_name":"Convex function","level":3,"score":0.38029998540878296},{"id":"https://openalex.org/C112680207","wikidata":"https://www.wikidata.org/wiki/Q714886","display_name":"Regular 
polygon","level":2,"score":0.37700000405311584},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.3508000075817108},{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.34299999475479126},{"id":"https://openalex.org/C45633534","wikidata":"https://www.wikidata.org/wiki/Q752487","display_name":"Dual space","level":2,"score":0.3375000059604645},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.3310999870300293},{"id":"https://openalex.org/C157972887","wikidata":"https://www.wikidata.org/wiki/Q463359","display_name":"Convex optimization","level":3,"score":0.32760000228881836},{"id":"https://openalex.org/C115680565","wikidata":"https://www.wikidata.org/wiki/Q5977448","display_name":"Gradient method","level":2,"score":0.3199000060558319},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.3095000088214874},{"id":"https://openalex.org/C184050105","wikidata":"https://www.wikidata.org/wiki/Q273163","display_name":"Isotropy","level":2,"score":0.30790001153945923},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2971000075340271},{"id":"https://openalex.org/C153658351","wikidata":"https://www.wikidata.org/wiki/Q746264","display_name":"Constant function","level":3,"score":0.28139999508857727},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.2791000008583069},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical 
optimization","level":1,"score":0.26899999380111694},{"id":"https://openalex.org/C2777021972","wikidata":"https://www.wikidata.org/wiki/Q22976830","display_name":"Uniqueness","level":2,"score":0.2563999891281128}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10485","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10485","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10485","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10485","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Reduced 
inequalities","id":"https://metadata.un.org/sdg/10","score":0.469390332698822}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In":[0],"this":[1],"work":[2],"we":[3,87,119,150,185,215,226],"study":[4,139],"the":[5,9,30,44,57,84,90,93,125,140,160,169,176,201,204],"convergence":[6,205],"properties":[7],"of":[8,29,56,92,115,124,143,168,175,207],"Dual":[10,144],"Space":[11,145],"Preconditioned":[12,146],"Gradient":[13,20,22,147],"Descent,":[14,21],"encompassing":[15],"optimizers":[16],"such":[17],"as":[18,118,182],"Normalized":[19],"Clipping":[23],"and":[24,41,76],"Adam.":[25],"We":[26,137],"consider":[27],"preconditioners":[28,174,235],"form":[31,58,177],"$\\nabla":[32],"K$,":[33],"where":[34,198],"$K:":[35],"\\mathbb{R}^p":[36],"\\to":[37],"\\mathbb{R}$":[38],"is":[39,46,200],"convex":[40],"assume":[42],"that":[43,89,131,187,228],"latter":[45],"applied":[47],"to":[48,99,134,194,237],"train":[49],"an":[50],"over-parameterized":[51],"linear":[52],"model":[53],"with":[54,128],"loss":[55],"$\\ell({X}":[59],"{W}":[60],"-":[61,191,243,248],"{Y})$,":[62],"for":[63,154,173,222,233,251],"weights":[64],"${W}":[65],"\\in":[66,72,79,103],"\\mathbb{R}^{d":[67,104],"\\times":[68,74,81,105],"k}$,":[69],"labels":[70],"${Y}":[71],"\\mathbb{R}^{n":[73,80],"k}$":[75,106],"data":[77],"${X}":[78],"d}$.":[82],"Under":[83],"aforementioned":[85],"assumptions,":[86],"prove":[88],"iterates":[91],"preconditioned":[94],"gradient":[95],"descent":[96],"always":[97],"converge":[98],"a":[100,121,165,229,238,252],"point":[101,206],"${W}_{\\infty}":[102,218],"satisfying":[107],"${X}{W}_{\\infty}":[108],"=":[109,179,196,219],"{Y}$.":[110],"Our":[111],"proof":[112],"techniques":[113],"are":[114],"independent":[116],"interest":[117],"introduce":[120],"novel":[122],"version":[123],"Bregman":[126],"Divergence":[127],"accompanying":[129],"identities":[130],"allow":[132
],"us":[133],"establish":[135],"convergence.":[136],"also":[138],"implicit":[141,170],"bias":[142],"Descent.":[148],"First,":[149],"demonstrate":[151],"empirically":[152],"that,":[153],"general":[155,234],"$K(\\cdot)$,":[156],"${W}_\\infty$":[157,188],"depends":[158],"on":[159],"chosen":[161],"learning":[162],"rate,":[163],"hindering":[164],"precise":[166],"characterization":[167],"bias.":[171],"Then,":[172],"$K({G})":[178],"h(\\|{G}\\|_F)$,":[180],"known":[181],"\\textit{isotropic":[183],"preconditioners},":[184],"show":[186,227],"minimizes":[189],"$\\|{W}_\\infty":[190],"{W}_0\\|_F^2$":[192],"subject":[193],"${X}{W}_\\infty":[195],"{Y}$,":[197],"${W}_0$":[199,211],"initialization.":[202],"Denoting":[203],"GD":[208],"initialized":[209],"at":[210],"by":[212],"${W}_{\\text{GD},":[213],"\\infty}$,":[214],"thus":[216],"note":[217],"{W}_{\\text{GD},":[220,249],"\\infty}$":[221],"isotropic":[223],"preconditioners.":[224],"Finally,":[225],"similar":[230],"fact":[231],"holds":[232],"up":[236],"multiplicative":[239],"constant,":[240],"namely,":[241],"$\\|{W}_0":[242],"{W}_{\\infty}\\|_F":[244],"\\le":[245],"c":[246],"\\|{W}_0":[247],"\\infty}\\|_F$":[250],"constant":[253],"$c>0$.":[254]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-13T00:00:00"}
