{"id":"https://openalex.org/W7147649184","doi":"https://doi.org/10.48550/arxiv.2603.29108","title":"Efficient Bilevel Optimization with KFAC-Based Hypergradients","display_name":"Efficient Bilevel Optimization with KFAC-Based Hypergradients","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7147649184","doi":"https://doi.org/10.48550/arxiv.2603.29108"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.29108","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29108","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.29108","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132697773","display_name":"Disen Liao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liao, Disen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012954635","display_name":"Felix Dangel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dangel, Felix","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5125737670","display_name":"Yaoliang Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Yaoliang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5132697773"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.824999988079071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.824999988079071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11206","display_name":"Model Reduction and Neural Networks","score":0.02979999966919422,"subfield":{"id":"https://openalex.org/subfields/3109","display_name":"Statistical and Nonlinear Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.02199999988079071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/curvature","display_name":"Curvature","score":0.6230000257492065},{"id":"https://openalex.org/keywords/conjugate-gradient-method","display_name":"Conjugate gradient method","score":0.6115999817848206},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5239999890327454},{"id":"https://openalex.org/keywords/bilevel-optimization","display_name":"Bilevel optimization","score":0.4650999903678894},{"id":"https://openalex.org/keywords/inverse","display_name":"Inverse","score":0.45509999990463257},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.44690001010894775},{"id":"https://openalex.org/keywords/von-neumann-architecture","display_name":"Von Neumann architecture","score":0.42410001158714294}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6668000221252441},{"id":"https://openalex.org/C195065555","wikidata":"https://www.wikidata.org/wiki/Q214881","display_name":"Curvature","level":2,"score":0.6230000257492065},{"id":"https://openalex.org/C81184566","wikidata":"https://www.wikidata.org/wiki/Q1191895","display_name":"Conjugate gradient method","level":2,"score":0.6115999817848206},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.5978000164031982},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5239999890327454},{"id":"https://openalex.org/C3309286","wikidata":"https://www.wikidata.org/wiki/Q4907693","display_name":"Bilevel optimization","level":3,"score":0.4650999903678894},{"id":"https://openalex.org/C207467116","wikidata":"https://www.wikidata.org/wiki/Q4385666","display_name":"Inverse","level":2,"score":0.45509999990463257},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.44690001010894775},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.42800000309944153},{"id":"https://openalex.org/C80469333","wikidata":"https://www.wikidata.org/wiki/Q189088","display_name":"Von Neumann architecture","level":2,"score":0.42410001158714294},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.33719998598098755},{"id":"https://openalex.org/C135252773","wikidata":"https://www.wikidata.org/wiki/Q1567213","display_name":"Inverse problem","level":2,"score":0.33570000529289246},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.33550000190734863},{"id":"https://openalex.org/C115680565","wikidata":"https://www.wikidata.org/wiki/Q5977448","display_name":"Gradient method","level":2,"score":0.329800009727478},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2800999879837036},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25690001249313354},{"id":"https://openalex.org/C62354387","wikidata":"https://www.wikidata.org/wiki/Q875399","display_name":"Boundary (topology)","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2549999952316284},{"id":"https://openalex.org/C177179195","wikidata":"https://www.wikidata.org/wiki/Q7268372","display_name":"Quadratic unconstrained binary optimization","level":4,"score":0.25200000405311584},{"id":"https://openalex.org/C31487907","wikidata":"https://www.wikidata.org/wiki/Q1154597","display_name":"Polygon mesh","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.29108","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29108","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.29108","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29108","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Bilevel":[0],"optimization":[1],"(BO)":[2],"is":[3,106,124],"widely":[4],"applicable":[5],"to":[6,57,99],"many":[7],"machine":[8],"learning":[9],"problems.":[10,95],"Scaling":[11],"BO,":[12],"however,":[13],"requires":[14],"repeatedly":[15],"computing":[16],"hypergradients,":[17],"which":[18,44],"involves":[19],"solving":[20],"inverse":[21],"Hessian-vector":[22],"products":[23],"(IHVPs).":[24],"In":[25],"practice,":[26],"these":[27],"operations":[28],"are":[29],"often":[30],"approximated":[31],"using":[32],"crude":[33],"surrogates":[34],"such":[35],"as":[36],"one-step":[37],"gradient":[38],"unrolling":[39],"or":[40,76],"identity/short":[41],"Neumann":[42,77],"expansions,":[43],"discard":[45],"curvature":[46,61,104],"information.":[47],"We":[48,83],"build":[49],"on":[50],"implicit":[51],"function":[52],"theorem-based":[53],"algorithms":[54],"and":[55,79,92,110,119],"propose":[56],"incorporate":[58],"Kronecker-factored":[59],"approximate":[60],"(KFAC),":[62],"yielding":[63],"curvature-aware":[64],"hypergradients":[65],"with":[66,115],"a":[67],"better":[68],"performance":[69],"efficiency":[70],"trade-off":[71],"than":[72],"Conjugate":[73],"Gradient":[74],"(CG)":[75],"methods":[78],"consistently":[80],"outperforming":[81],"unrolling.":[82],"evaluate":[84],"this":[85],"approach":[86],"across":[87],"diverse":[88],"tasks,":[89],"including":[90],"meta-learning":[91],"AI":[93],"safety":[94],"On":[96],"models":[97],"up":[98],"BERT,":[100],"we":[101],"show":[102],"that":[103],"information":[105],"valuable":[107],"at":[108,126],"scale,":[109],"KFAC":[111],"can":[112],"provide":[113],"it":[114],"only":[116],"modest":[117],"memory":[118],"runtime":[120],"overhead.":[121],"Our":[122],"implementation":[123],"available":[125],"https://github.com/liaodisen/NeuralBo.":[127]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
