{"id":"https://openalex.org/W7161939756","doi":"https://doi.org/10.48550/arxiv.2605.21104","title":"HORST: Composing Optimizer Geometries for Sparse Transformer Training","display_name":"HORST: Composing Optimizer Geometries for Sparse Transformer Training","publication_year":2026,"publication_date":"2026-05-20","ids":{"openalex":"https://openalex.org/W7161939756","doi":"https://doi.org/10.48550/arxiv.2605.21104"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.21104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.21104","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109597158","display_name":"Tom Jacobs","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jacobs, Tom","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136724494","display_name":"Rohan Jain","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jain, Rohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136639495","display_name":"Rebekka Burkholz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Burkholz, Rebekka","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9111999869346619,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9111999869346619,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10500","display_name":"Sparse and Compressive Sensing Techniques","score":0.026900000870227814,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.013500000350177288,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6642000079154968},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.6572999954223633},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.436599999666214},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.37549999356269836},{"id":"https://openalex.org/keywords/horst","display_name":"Horst","score":0.36079999804496765},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.3224000036716461}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6642000079154968},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.6572999954223633},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6284000277519226},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.436599999666214},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.43470001220703125},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43220001459121704},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.37549999356269836},{"id":"https://openalex.org/C2776074449","wikidata":"https://www.wikidata.org/wiki/Q212136","display_name":"Horst","level":3,"score":0.36079999804496765},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.3224000036716461},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3215000033378601},{"id":"https://openalex.org/C2987595161","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Optimization algorithm","level":2,"score":0.32100000977516174},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.29660001397132874},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.27149999141693115},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.26989999413490295},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.21104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.21104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.21104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Sparsifying":[0],"transformers":[1,98],"remains":[2],"a":[3,39,58,70,85],"fundamental":[4],"challenge,":[5],"as":[6,47],"standard":[7],"optimizers":[8,20],"fail":[9],"to":[10,50],"simultaneously":[11],"encourage":[12],"sparsity":[13,29,82,114],"and":[14,52,102,107],"maintain":[15],"training":[16,96],"stability.":[17],"Effective":[18],"adaptive":[19,77],"exhibit":[21],"an":[22,31],"implicit":[23],"$L_{\\infty}$":[24],"bias":[25,83],"favoring":[26],"stability,":[27],"yet,":[28],"requires":[30],"$L_1$":[32,81],"bias.":[33],"To":[34],"integrate":[35],"sparsity,":[36],"we":[37,45],"propose":[38],"composition":[40],"of":[41,97],"optimizer":[42,72],"steps,":[43],"which":[44],"cast":[46],"non-commutative":[48],"operators":[49],"analyze":[51],"combine":[53],"their":[54],"optimization":[55],"geometry":[56],"in":[57],"principled":[59],"way.":[60],"This":[61],"yields":[62],"HORST":[63,105],"(Hyperbolic":[64],"Operator":[65],"for":[66,94],"Robust":[67],"Sparse":[68],"Training),":[69],"modular":[71],"that":[73],"inherits":[74],"stability":[75],"from":[76],"methods":[78],"while":[79],"inducing":[80],"through":[84],"hyperbolic":[86],"mirror":[87],"map.":[88],"Our":[89],"experiments":[90],"demonstrate":[91],"its":[92],"utility":[93],"sparse":[95],"on":[99],"both":[100],"vision":[101],"language":[103],"tasks.":[104],"consistently":[106],"significantly":[108],"outperforms":[109],"AdamW":[110],"baselines":[111],"across":[112],"all":[113],"levels,":[115],"with":[116],"large":[117],"gains":[118],"at":[119],"higher":[120],"sparsity.":[121]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-22T00:00:00"}
