{"id":"https://openalex.org/W7140413880","doi":"https://doi.org/10.48550/arxiv.2603.23998","title":"Sparse Growing Transformer: Training-Time Sparse Depth Allocation via Progressive Attention Looping","display_name":"Sparse Growing Transformer: Training-Time Sparse Depth Allocation via Progressive Attention Looping","publication_year":2026,"publication_date":"2026-03-25","ids":{"openalex":"https://openalex.org/W7140413880","doi":"https://doi.org/10.48550/arxiv.2603.23998"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.23998","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23998","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.23998","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130683956","display_name":"Yao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Yao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130683150","display_name":"Yilong Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yilong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130692648","display_name":"Yinqi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yinqi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023311055","display_name":"Junyuan Shang","orcid":"https://orcid.org/0000-0003-4301-750X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shang, Junyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130633907","display_name":"Zhenyu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhenyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130680920","display_name":"Zefeng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zefeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130704072","display_name":"Shuaiyi Nie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nie, Shuaiyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130698171","display_name":"Shuohuan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuohuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130646331","display_name":"Yu Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130660341","display_name":"Hua Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Hua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130713406","display_name":"HaiFeng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, HaiFeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130676572","display_name":"Tingwen Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Tingwen","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5130683956"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.33640000224113464,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.33640000224113464,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.23270000517368317,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11019","display_name":"Image Enhancement Techniques","score":0.08540000021457672,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.6625999808311462},{"id":"https://openalex.org/keywords/flops","display_name":"FLOPS","score":0.6601999998092651},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.5644000172615051},{"id":"https://openalex.org/keywords/a-priori-and-a-posteriori","display_name":"A priori and a posteriori","score":0.4074999988079071},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4050999879837036},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.3977000117301941},{"id":"https://openalex.org/keywords/rigidity","display_name":"Rigidity (electromagnetism)","score":0.3864000141620636}],"concepts":[{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.6625999808311462},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.6601999998092651},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6571999788284302},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.5644000172615051},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5586000084877014},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4334000051021576},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.4074999988079071},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4050999879837036},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3977000117301941},{"id":"https://openalex.org/C160343418","wikidata":"https://www.wikidata.org/wiki/Q185256","display_name":"Rigidity (electromagnetism)","level":2,"score":0.3864000141620636},{"id":"https://openalex.org/C1893757","wikidata":"https://www.wikidata.org/wiki/Q3653001","display_name":"Inversion (geology)","level":3,"score":0.3314000070095062},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.3174999952316284},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C2780186347","wikidata":"https://www.wikidata.org/wiki/Q11414","display_name":"Subnetwork","level":2,"score":0.3089999854564667},{"id":"https://openalex.org/C73586568","wikidata":"https://www.wikidata.org/wiki/Q2600211","display_name":"Parameter space","level":2,"score":0.30880001187324524},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.2976999878883362},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2700999975204468},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.2669000029563904}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.23998","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23998","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.23998","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23998","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"approaches":[1],"to":[2,38,54,128,186,190],"increasing":[3,145],"the":[4,22,28,42,109,178],"effective":[5],"depth":[6,34,65,119,146],"of":[7,152],"Transformers":[8],"predominantly":[9],"rely":[10],"on":[11,135],"parameter":[12,43,51,161],"reuse,":[13],"extending":[14],"computation":[15],"through":[16],"recursive":[17],"execution.":[18],"Under":[19],"this":[20,105],"paradigm,":[21],"network":[23],"structure":[24],"remains":[25],"static":[26,73,169],"along":[27],"training":[29,48,68,155,180],"timeline,":[30],"and":[31,50],"additional":[32,179],"computational":[33,56],"is":[35,115],"uniformly":[36],"assigned":[37],"entire":[39],"blocks":[40],"at":[41],"level.":[44],"This":[45,138],"rigidity":[46],"across":[47,90,159],"time":[49],"space":[52],"leads":[53],"substantial":[55],"redundancy":[57],"during":[58,67],"training.":[59],"In":[60],"contrast,":[61],"we":[62,107],"argue":[63],"that":[64,122,164],"allocation":[66,120],"should":[69],"not":[70],"be":[71],"a":[72,77,86,97,116,149,191],"preset,":[74],"but":[75],"rather":[76],"progressively":[78,123],"growing":[79],"structural":[80,141],"process.":[81],"Our":[82],"systematic":[83],"analysis":[84],"reveals":[85],"deep-to-shallow":[87],"maturation":[88],"trajectory":[89],"layers,":[91],"where":[92],"high-entropy":[93],"attention":[94,133],"heads":[95],"play":[96],"crucial":[98],"role":[99],"in":[100],"semantic":[101],"integration.":[102],"Motivated":[103],"by":[104,143],"observation,":[106],"introduce":[108],"Sparse":[110],"Growing":[111],"Transformer":[112,193],"(SGT).":[113],"SGT":[114,165],"training-time":[117,168],"sparse":[118],"framework":[121],"extends":[124],"recurrence":[125],"from":[126,183],"deeper":[127],"shallower":[129],"layers":[130],"via":[131],"targeted":[132],"looping":[134,171],"informative":[136],"heads.":[137],"mechanism":[139],"induces":[140],"sparsity":[142],"selectively":[144],"only":[147,187],"for":[148],"small":[150],"subset":[151],"parameters":[153],"as":[154],"evolves.":[156],"Extensive":[157],"experiments":[158],"multiple":[160],"scales":[162],"demonstrate":[163],"consistently":[166],"outperforms":[167],"block-level":[170],"baselines":[172],"under":[173],"comparable":[174],"settings,":[175],"while":[176],"reducing":[177],"FLOPs":[181],"overhead":[182],"approximately":[184],"16--20%":[185],"1--3%":[188],"relative":[189],"standard":[192],"backbone.":[194]},"counts_by_year":[],"updated_date":"2026-03-27T06:05:27.210665","created_date":"2026-03-27T00:00:00"}
