{"id":"https://openalex.org/W4414895197","doi":"https://doi.org/10.48550/arxiv.2506.01143","title":"Linear regression with overparameterized linear neural networks: Tight upper and lower bounds for implicit $\\ell^1$-regularization","display_name":"Linear regression with overparameterized linear neural networks: Tight upper and lower bounds for implicit $\\ell^1$-regularization","publication_year":2025,"publication_date":"2025-06-01","ids":{"openalex":"https://openalex.org/W4414895197","doi":"https://doi.org/10.48550/arxiv.2506.01143"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2506.01143","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.01143","pdf_url":"https://arxiv.org/pdf/2506.01143","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2506.01143","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5076049326","display_name":"Hannes Matt","orcid":"https://orcid.org/0000-0002-1578-1634"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Matt, Hannes","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5006637271","display_name":"Dominik St\u00f6ger","orcid":"https://orcid.org/0000-0002-0543-9456"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"St\u00f6ger, Dominik","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5076049326"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10500","display_name":"Sparse and Compressive Sensing Techniques","score":0.9564999938011169,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10500","display_name":"Sparse and Compressive Sensing Techniques","score":0.9564999938011169,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11205","display_name":"Numerical methods in inverse problems","score":0.9419999718666077,"subfield":{"id":"https://openalex.org/subfields/2610","display_name":"Mathematical Physics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9352999925613403,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.6539000272750854},{"id":"https://openalex.org/keywords/diagonal","display_name":"Diagonal","score":0.5594000220298767},{"id":"https://openalex.org/keywords/gradient-descent","display_name":"Gradient descent","score":0.5529000163078308},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.5471000075340271},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.47440001368522644},{"id":"https://openalex.org/keywords/linear-regression","display_name":"Linear regression","score":0.45249998569488525},{"id":"https://openalex.org/keywords/limit","display_name":"Limit (mathematics)","score":0.4275999963283539},{"id":"https://openalex.org/keywords/approximation-error","display_name":"Approximation error","score":0.3625999987125397},{"id":"https://openalex.org/keywords/upper-and-lower-bounds","display_name":"Upper and lower bounds","score":0.362199991941452},{"id":"https://openalex.org/keywords/linear-model","display_name":"Linear model","score":0.34220001101493835}],"concepts":[{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.7283999919891357},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.6539000272750854},{"id":"https://openalex.org/C130367717","wikidata":"https://www.wikidata.org/wiki/Q189791","display_name":"Diagonal","level":2,"score":0.5594000220298767},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.5529000163078308},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.5471000075340271},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.5212000012397766},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.47440001368522644},{"id":"https://openalex.org/C48921125","wikidata":"https://www.wikidata.org/wiki/Q10861030","display_name":"Linear regression","level":2,"score":0.45249998569488525},{"id":"https://openalex.org/C151201525","wikidata":"https://www.wikidata.org/wiki/Q177239","display_name":"Limit (mathematics)","level":2,"score":0.4275999963283539},{"id":"https://openalex.org/C122383733","wikidata":"https://www.wikidata.org/wiki/Q865920","display_name":"Approximation error","level":2,"score":0.3625999987125397},{"id":"https://openalex.org/C77553402","wikidata":"https://www.wikidata.org/wiki/Q13222579","display_name":"Upper and lower bounds","level":2,"score":0.362199991941452},{"id":"https://openalex.org/C163175372","wikidata":"https://www.wikidata.org/wiki/Q3339222","display_name":"Linear model","level":2,"score":0.34220001101493835},{"id":"https://openalex.org/C57869625","wikidata":"https://www.wikidata.org/wiki/Q1783502","display_name":"Rate of convergence","level":3,"score":0.33889999985694885},{"id":"https://openalex.org/C177918212","wikidata":"https://www.wikidata.org/wiki/Q803623","display_name":"Perturbation (astronomy)","level":2,"score":0.3384999930858612},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.31929999589920044},{"id":"https://openalex.org/C167879884","wikidata":"https://www.wikidata.org/wiki/Q727568","display_name":"Balanced flow","level":2,"score":0.31859999895095825},{"id":"https://openalex.org/C6802819","wikidata":"https://www.wikidata.org/wiki/Q1072174","display_name":"Linear system","level":2,"score":0.31470000743865967},{"id":"https://openalex.org/C2777027219","wikidata":"https://www.wikidata.org/wiki/Q1284190","display_name":"Constant (computer programming)","level":2,"score":0.31290000677108765},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.31060001254081726},{"id":"https://openalex.org/C84545080","wikidata":"https://www.wikidata.org/wiki/Q1147936","display_name":"Condition number","level":3,"score":0.3082999885082245},{"id":"https://openalex.org/C160824197","wikidata":"https://www.wikidata.org/wiki/Q2071054","display_name":"Linear approximation","level":3,"score":0.3001999855041504},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C38349280","wikidata":"https://www.wikidata.org/wiki/Q1434290","display_name":"Flow (mathematics)","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C73586568","wikidata":"https://www.wikidata.org/wiki/Q2600211","display_name":"Parameter space","level":2,"score":0.2615000009536743},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.2612999975681305},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.2574999928474426},{"id":"https://openalex.org/C49766605","wikidata":"https://www.wikidata.org/wiki/Q207643","display_name":"Linear map","level":2,"score":0.2565999925136566},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.25529998540878296},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.25189998745918274}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2506.01143","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.01143","pdf_url":"https://arxiv.org/pdf/2506.01143","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2506.01143","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2506.01143","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.01143","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.01143","pdf_url":"https://arxiv.org/pdf/2506.01143","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"machine":[1],"learning":[2],"models":[3],"are":[4],"often":[5],"trained":[6],"in":[7,29,41,78,188],"a":[8,142],"setting":[9],"where":[10,165],"the":[11,16,23,42,96,100,108,111,122,129,134,151,166,189,195],"number":[12,17],"of":[13,18,26,83,103,136,198],"parameters":[14],"exceeds":[15],"training":[19],"samples.":[20],"To":[21],"understand":[22],"implicit":[24,69,76],"bias":[25],"gradient":[27,55,104],"descent":[28,56],"such":[30],"overparameterized":[31,88],"models,":[32],"prior":[33],"work":[34],"has":[35],"studied":[36,187],"diagonal":[37,79],"linear":[38,80,89],"neural":[39,81],"networks":[40,82],"regression":[43,90],"setting.":[44],"These":[45],"studies":[46],"have":[47],"shown":[48],"that,":[49],"when":[50],"initialized":[51],"with":[52,61,155],"small":[53],"weights,":[54],"tends":[57],"to":[58,110,181,221],"favor":[59],"solutions":[60],"minimal":[62],"$\\ell^1$-norm":[63],"-":[64],"an":[65],"effect":[66],"known":[67],"as":[68],"regularization.":[70],"In":[71],"this":[72,176],"paper,":[73],"we":[74,125],"investigate":[75],"regularization":[77],"depth":[84],"$D\\ge":[85],"2$":[86],"for":[87,147,158,225],"problems.":[91],"We":[92,193],"focus":[93],"on":[94,121,133],"analyzing":[95],"approximation":[97,123,130],"error":[98,131,152],"between":[99,145],"limit":[101],"point":[102],"flow":[105],"trajectories":[106],"and":[107,118,210],"solution":[109],"$\\ell^1$-minimization":[112],"problem.":[113],"By":[114],"deriving":[115],"tight":[116],"upper":[117],"lower":[119],"bounds":[120,200],"error,":[124],"precisely":[126],"characterize":[127],"how":[128],"depends":[132],"scale":[135],"initialization":[137,227],"$\u03b1$.":[138],"Our":[139],"results":[140],"reveal":[141],"qualitative":[143],"difference":[144],"depths:":[146],"$D":[148,216],"\\ge":[149,217],"3$,":[150,218],"decreases":[153,161],"linearly":[154],"$\u03b1$,":[156],"whereas":[157],"$D=2$,":[159],"it":[160],"at":[162],"rate":[163],"$\u03b1^{1-\\varrho}$,":[164],"parameter":[167,177],"$\\varrho":[168],"\\in":[169],"[0,1)$":[170],"can":[171],"be":[172],"explicitly":[173],"characterized.":[174],"Interestingly,":[175],"is":[178],"closely":[179],"linked":[180],"so-called":[182],"null":[183],"space":[184],"property":[185],"constants":[186],"sparse":[190],"recovery":[191],"literature.":[192],"demonstrate":[194],"asymptotic":[196],"tightness":[197],"our":[199,207],"through":[201],"explicit":[202],"examples.":[203],"Numerical":[204],"experiments":[205],"corroborate":[206],"theoretical":[208],"findings":[209],"suggest":[211],"that":[212],"deeper":[213],"networks,":[214],"i.e.,":[215],"may":[219],"lead":[220],"better":[222],"generalization,":[223],"particularly":[224],"realistic":[226],"scales.":[228]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
