{"id":"https://openalex.org/W4417297115","doi":"https://doi.org/10.48550/arxiv.2512.10938","title":"Stronger Normalization-Free Transformers","display_name":"Stronger Normalization-Free Transformers","publication_year":2025,"publication_date":"2025-12-11","ids":{"openalex":"https://openalex.org/W4417297115","doi":"https://doi.org/10.48550/arxiv.2512.10938"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2512.10938","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.10938","pdf_url":"https://arxiv.org/pdf/2512.10938","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2512.10938","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101538376","display_name":"Mingzhi Chen","orcid":"https://orcid.org/0000-0003-3240-909X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Mingzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099446810","display_name":"Taiming Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Taiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009199938","display_name":"Jiachen Zhu","orcid":"https://orcid.org/0000-0003-1325-3552"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Jiachen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017171970","display_name":"Mingjie Sun","orcid":"https://orcid.org/0000-0002-3697-7927"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Mingjie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100452097","display_name":"Zhuang Liu","orcid":"https://orcid.org/0000-0002-4269-8297"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhuang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6072999835014343,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.6072999835014343,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.11959999799728394,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.029500000178813934,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.5719000101089478},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.48919999599456787},{"id":"https://openalex.org/keywords/hyperbolic-function","display_name":"Hyperbolic function","score":0.3792000114917755},{"id":"https://openalex.org/keywords/gaussian","display_name":"Gaussian","score":0.37860000133514404},{"id":"https://openalex.org/keywords/cumulative-distribution-function","display_name":"Cumulative distribution function","score":0.3562999963760376},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.32359999418258667},{"id":"https://openalex.org/keywords/gaussian-process","display_name":"Gaussian process","score":0.32190001010894775}],"concepts":[{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.5719000101089478},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5460000038146973},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.48919999599456787},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44830000400543213},{"id":"https://openalex.org/C92047909","wikidata":"https://www.wikidata.org/wiki/Q204034","display_name":"Hyperbolic function","level":2,"score":0.3792000114917755},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.37860000133514404},{"id":"https://openalex.org/C103784038","wikidata":"https://www.wikidata.org/wiki/Q386228","display_name":"Cumulative distribution function","level":3,"score":0.3562999963760376},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.32359999418258667},{"id":"https://openalex.org/C61326573","wikidata":"https://www.wikidata.org/wiki/Q1496376","display_name":"Gaussian process","level":3,"score":0.32190001010894775},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.32179999351501465},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.31859999895095825},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.31470000743865967},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2888000011444092},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.28760001063346863},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.28209999203681946},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C40506919","wikidata":"https://www.wikidata.org/wiki/Q7452469","display_name":"Sequence learning","level":2,"score":0.26109999418258667},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2512.10938","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.10938","pdf_url":"https://arxiv.org/pdf/2512.10938","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2512.10938","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.10938","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2512.10938","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.10938","pdf_url":"https://arxiv.org/pdf/2512.10938","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320309292","display_name":"Princeton University","ror":"https://ror.org/00hx57361"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4417297115.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Although":[0],"normalization":[1],"layers":[2],"have":[3],"long":[4],"been":[5],"viewed":[6],"as":[7,103],"indispensable":[8],"components":[9],"of":[10,17,59,118,139],"deep":[11],"learning":[12],"architectures,":[13],"the":[14,56,94,104,136],"recent":[15],"introduction":[16],"Dynamic":[18],"Tanh":[19],"(DyT)":[20],"has":[21],"demonstrated":[22],"that":[23,48,135],"alternatives":[24],"are":[25],"possible.":[26],"The":[27],"point-wise":[28,60],"function":[29,46,79],"DyT":[30,113],"constrains":[31],"extreme":[32],"values":[33],"for":[34,45,75,162],"stable":[35],"convergence":[36],"and":[37,64,100,112,123,127,154],"reaches":[38],"normalization-level":[39],"performance;":[40],"this":[41,82],"work":[42],"seeks":[43],"further":[44],"designs":[47],"can":[49],"surpass":[50],"it.":[51],"We":[52],"first":[53],"study":[54],"how":[55],"intrinsic":[57],"properties":[58],"functions":[61],"influence":[62],"training":[63],"performance.":[65],"Building":[66],"on":[67],"these":[68],"findings,":[69],"we":[70,84],"conduct":[71],"a":[72,76,115,159],"large-scale":[73],"search":[74],"more":[77],"effective":[78],"design.":[80,107],"Through":[81],"exploration,":[83],"introduce":[85],"$\\mathrm{Derf}(x)":[86],"=":[87],"\\mathrm{erf}(\u03b1x":[88],"+":[89],"s)$,":[90],"where":[91],"$\\mathrm{erf}(x)$":[92],"is":[93],"rescaled":[95],"Gaussian":[96],"cumulative":[97],"distribution":[98],"function,":[99],"identify":[101],"it":[102],"most":[105],"performant":[106],"Derf":[108,140,158],"outperforms":[109],"LayerNorm,":[110],"RMSNorm,":[111],"across":[114],"wide":[116],"range":[117],"domains,":[119],"including":[120],"visual":[121],"recognition":[122],"generation,":[124],"speech":[125],"representation,":[126],"DNA":[128],"sequence":[129],"modeling.":[130],"Our":[131],"analysis":[132],"also":[133],"suggests":[134],"performance":[137,156],"gains":[138],"largely":[141],"stem":[142],"from":[143],"its":[144],"improved":[145],"generalization":[146],"rather":[147],"than":[148],"stronger":[149,155],"fitting":[150],"capacity.":[151],"Its":[152],"simplicity":[153],"make":[157],"practical":[160],"choice":[161],"normalization-free":[163],"Transformer":[164],"architectures.":[165]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-12-13T00:00:00"}
