{"id":"https://openalex.org/W4416199165","doi":"https://doi.org/10.1145/3712285.3759783","title":"Hypertron: Efficiently Scaling Large Models by Exploring High-Dimensional Parallelization Space","display_name":"Hypertron: Efficiently Scaling Large Models by Exploring High-Dimensional Parallelization Space","publication_year":2025,"publication_date":"2025-11-12","ids":{"openalex":"https://openalex.org/W4416199165","doi":"https://doi.org/10.1145/3712285.3759783"},"language":null,"primary_location":{"id":"doi:10.1145/3712285.3759783","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759783","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3712285.3759783","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088172355","display_name":"Shigang Li","orcid":"https://orcid.org/0000-0003-0022-7865"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Shigang Li","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jingkun Dong","orcid":"https://orcid.org/0009-0002-6866-6215"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingkun Dong","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031431025","display_name":"Jinshou Chen","orcid":"https://orcid.org/0009-0007-8800-6026"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jihao Chen","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhi Ma","orcid":"https://orcid.org/0009-0009-9847-2770"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhi Ma","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5075481740","display_name":"Zhongzhe Hu","orcid":"https://orcid.org/0000-0002-6708-3942"},"institutions":[{"id":"https://openalex.org/I2250955327","display_name":"Huawei Technologies (China)","ror":"https://ror.org/00cmhce21","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250955327"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongzhe Hu","raw_affiliation_strings":["Huawei, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Huawei, Beijing, China","institution_ids":["https://openalex.org/I2250955327"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5088172355"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":1.2181,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.8505195,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1755","last_page":"1768"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.42820000648498535,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.42820000648498535,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.16670000553131104,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.0632999986410141,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7570000290870667},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.7078999876976013},{"id":"https://openalex.org/keywords/flops","display_name":"FLOPS","score":0.6359000205993652},{"id":"https://openalex.org/keywords/dimension","display_name":"Dimension (graph theory)","score":0.5246000289916992},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5083000063896179},{"id":"https://openalex.org/keywords/automatic-parallelization","display_name":"Automatic parallelization","score":0.44830000400543213},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.44029998779296875},{"id":"https://openalex.org/keywords/parallel-processing","display_name":"Parallel processing","score":0.4129999876022339}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8263999819755554},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7570000290870667},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7570000290870667},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.7078999876976013},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.6359000205993652},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.5246000289916992},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5083000063896179},{"id":"https://openalex.org/C164833996","wikidata":"https://www.wikidata.org/wiki/Q2323839","display_name":"Automatic parallelization","level":3,"score":0.44830000400543213},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.44029998779296875},{"id":"https://openalex.org/C106515295","wikidata":"https://www.wikidata.org/wiki/Q26806595","display_name":"Parallel processing","level":2,"score":0.4129999876022339},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.40689998865127563},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.40549999475479126},{"id":"https://openalex.org/C120373497","wikidata":"https://www.wikidata.org/wiki/Q1087987","display_name":"Parallel algorithm","level":2,"score":0.351500004529953},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.3052999973297119},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3050999939441681},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C11644782","wikidata":"https://www.wikidata.org/wiki/Q15401790","display_name":"Cost efficiency","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.275299996137619},{"id":"https://openalex.org/C2776221188","wikidata":"https://www.wikidata.org/wiki/Q21072556","display_name":"Design space exploration","level":2,"score":0.26739999651908875}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3712285.3759783","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759783","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3712285.3759783","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3712285.3759783","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2215447491","display_name":null,"funder_award_id":"Grant No. 62372055","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5450936095","display_name":null,"funder_award_id":"2023ZD0120502","funder_id":"https://openalex.org/F4320329860","funder_display_name":"National Science and Technology Major Project"},{"id":"https://openalex.org/G8888975561","display_name":null,"funder_award_id":"","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320329860","display_name":"National Science and Technology Major Project","ror":null},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W2010747199","https://openalex.org/W2012652661","https://openalex.org/W2060393849","https://openalex.org/W2065231486","https://openalex.org/W2168748809","https://openalex.org/W2883830791","https://openalex.org/W2967558351","https://openalex.org/W2969388332","https://openalex.org/W2972087877","https://openalex.org/W2983101505","https://openalex.org/W2984305089","https://openalex.org/W2988647680","https://openalex.org/W3081168214","https://openalex.org/W3204998121","https://openalex.org/W3206832494","https://openalex.org/W4220967350","https://openalex.org/W4376652719","https://openalex.org/W4386709668","https://openalex.org/W4396712969","https://openalex.org/W4411486137"],"related_works":[],"abstract_inverted_index":{"Large":[0],"models":[1,25,136],"are":[2,34],"evolving":[3],"towards":[4],"massive":[5],"scale,":[6],"diverse":[7],"model":[8,77],"architectures":[9],"(dense":[10],"and":[11,13,73,104,141],"sparse)":[12],"long-context":[14],"processing,":[15],"which":[16,56,93],"makes":[17],"it":[18],"very":[19],"challenging":[20],"to":[21,38,62,78,88,118],"efficiently":[22],"scale":[23],"large":[24,135],"on":[26,124],"parallel":[27,52,109],"machines.":[28],"The":[29],"current":[30],"widely-used":[31],"parallelization":[32,41,64],"strategies":[33],"often":[35],"sub-optimal":[36],"due":[37],"their":[39],"limited":[40],"strategy":[42,92],"space.":[43],"To":[44],"this":[45],"end,":[46],"we":[47],"propose":[48],"Hypertron,":[49],"a":[50,66,74],"scalable":[51],"large-model":[53],"training":[54],"framework":[55],"incorporates":[57],"an":[58],"unprecedented":[59],"high-dimensional":[60,81,86],"(up":[61],"7D)":[63],"space,":[65],"holistic":[67],"scheme":[68],"for":[69,133],"efficient":[70],"dimension":[71],"fusion,":[72],"comprehensive":[75],"performance":[76],"guide":[79],"the":[80,85,90,148,152],"exploration.":[82],"By":[83],"exploiting":[84],"space":[87],"discover":[89],"optimal":[91],"is":[94],"not":[95],"supported":[96],"by":[97],"existing":[98],"frameworks,":[99],"Hypertron":[100,115],"significantly":[101],"reduces":[102],"memory":[103],"communication":[105],"cost":[106],"while":[107],"improving":[108],"scalability.":[110],"Extensive":[111],"evaluations":[112],"demonstrate":[113],"that":[114],"achieves":[116],"up":[117],"56.7%":[119],"Model":[120],"FLOPs":[121],"Utilization":[122],"(MFU)":[123],"2,048":[125],"new-generation":[126],"Ascend":[127],"NPU":[128],"accelerators":[129],"(scaling":[130],"with":[131,144],"supernodes)":[132],"different":[134],"(such":[137],"as":[138],"sparse":[139],"141B":[140],"dense":[142],"310B),":[143],"1.33x":[145],"speedup":[146],"over":[147],"best":[149],"configuration":[150],"of":[151],"state-of-the-art":[153],"frameworks.":[154]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-11-12T00:00:00"}
