{"id":"https://openalex.org/W4415125289","doi":"https://doi.org/10.1109/icnp65844.2025.11192443","title":"MAZ3: Memory-Assisted ZeRO-3 for Efficient Collective Communication","display_name":"MAZ3: Memory-Assisted ZeRO-3 for Efficient Collective Communication","publication_year":2025,"publication_date":"2025-09-22","ids":{"openalex":"https://openalex.org/W4415125289","doi":"https://doi.org/10.1109/icnp65844.2025.11192443"},"language":"en","primary_location":{"id":"doi:10.1109/icnp65844.2025.11192443","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icnp65844.2025.11192443","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 33rd International Conference on Network Protocols (ICNP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101600369","display_name":"Yang Liu","orcid":"https://orcid.org/0009-0006-6347-0183"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yang Liu","raw_affiliation_strings":["Northeastern University,China"],"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093548286","display_name":"Chenyang Hei","orcid":"https://orcid.org/0000-0001-5010-1529"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chenyang Hei","raw_affiliation_strings":["Northeastern University,China"],"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083598234","display_name":"Fuliang Li","orcid":"https://orcid.org/0000-0001-9782-0053"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fuliang Li","raw_affiliation_strings":["Northeastern University,China"],"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106856666","display_name":"Chengxi Gao","orcid":"https://orcid.org/0000-0003-1386-7394"},"institutions":[{"id":"https://openalex.org/I4210145761","display_name":"Shenzhen Institutes of Advanced Technology","ror":"https://ror.org/04gh4er46","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210145761"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chengxi Gao","raw_affiliation_strings":["Chinese Academy of Sciences,Shenzhen Institutes of Advanced Technology"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,Shenzhen Institutes of Advanced Technology","institution_ids":["https://openalex.org/I4210145761"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100326904","display_name":"Xingwei Wang","orcid":"https://orcid.org/0000-0001-7605-218X"},"institutions":[{"id":"https://openalex.org/I9224756","display_name":"Northeastern University","ror":"https://ror.org/03awzbc87","country_code":"CN","type":"education","lineage":["https://openalex.org/I9224756"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingwei Wang","raw_affiliation_strings":["Northeastern University,China"],"affiliations":[{"raw_affiliation_string":"Northeastern University,China","institution_ids":["https://openalex.org/I9224756"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101600369"],"corresponding_institution_ids":["https://openalex.org/I9224756"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35891037,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.8564000129699707,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13382","display_name":"Robotics and Automated Systems","score":0.8564000129699707,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.8222000002861023,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12784","display_name":"Modular Robots and Swarm Intelligence","score":0.7379999756813049,"subfield":{"id":"https://openalex.org/subfields/2210","display_name":"Mechanical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5526999831199646},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.5472999811172485},{"id":"https://openalex.org/keywords/broadcasting","display_name":"Broadcasting (networking)","score":0.4372999966144562},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.43320000171661377},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.41359999775886536},{"id":"https://openalex.org/keywords/memory-model","display_name":"Memory model","score":0.4068000018596649},{"id":"https://openalex.org/keywords/shared-memory","display_name":"Shared memory","score":0.38449999690055847},{"id":"https://openalex.org/keywords/distributed-shared-memory","display_name":"Distributed shared memory","score":0.3375000059604645}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8141000270843506},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5741999745368958},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5526999831199646},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.5472999811172485},{"id":"https://openalex.org/C110157686","wikidata":"https://www.wikidata.org/wiki/Q922122","display_name":"Broadcasting (networking)","level":2,"score":0.4372999966144562},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.43320000171661377},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.41359999775886536},{"id":"https://openalex.org/C12186640","wikidata":"https://www.wikidata.org/wiki/Q6815743","display_name":"Memory model","level":3,"score":0.4068000018596649},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.38449999690055847},{"id":"https://openalex.org/C39528615","wikidata":"https://www.wikidata.org/wiki/Q1229610","display_name":"Distributed shared memory","level":5,"score":0.3375000059604645},{"id":"https://openalex.org/C91481028","wikidata":"https://www.wikidata.org/wiki/Q1054686","display_name":"Distributed memory","level":3,"score":0.33629998564720154},{"id":"https://openalex.org/C57863822","wikidata":"https://www.wikidata.org/wiki/Q905488","display_name":"Flat memory model","level":4,"score":0.3357999920845032},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.32249999046325684},{"id":"https://openalex.org/C63511323","wikidata":"https://www.wikidata.org/wiki/Q908936","display_name":"Interleaved memory","level":4,"score":0.3190000057220459},{"id":"https://openalex.org/C158156997","wikidata":"https://www.wikidata.org/wiki/Q1416645","display_name":"Models of communication","level":2,"score":0.2985000014305115},{"id":"https://openalex.org/C51290061","wikidata":"https://www.wikidata.org/wiki/Q1936765","display_name":"Uniform memory access","level":4,"score":0.2980000078678131},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.29589998722076416},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.2849000096321106},{"id":"https://openalex.org/C41036726","wikidata":"https://www.wikidata.org/wiki/Q844824","display_name":"Physical address","level":3,"score":0.2797999978065491},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2680000066757202}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icnp65844.2025.11192443","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icnp65844.2025.11192443","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 33rd International Conference on Network Protocols (ICNP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W1990962327","https://openalex.org/W2152839228","https://openalex.org/W2798515322","https://openalex.org/W2979719709","https://openalex.org/W3043443960","https://openalex.org/W3129831491","https://openalex.org/W3138516171","https://openalex.org/W3139689176","https://openalex.org/W3193985311","https://openalex.org/W3205803342","https://openalex.org/W4312060029","https://openalex.org/W4318541551","https://openalex.org/W4320495408","https://openalex.org/W4321636583","https://openalex.org/W4386768656","https://openalex.org/W4394998892","https://openalex.org/W4395117348"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4],"advanced":[5],"rapidly,":[6],"but":[7,54],"their":[8],"growing":[9],"parameter":[10],"scales":[11],"and":[12,37,48,62,91,120,136,151,160,181,193],"memory":[13,23,42,52,83,90,172,191],"demands":[14],"pose":[15],"critical":[16],"challenges":[17],"for":[18],"distributed":[19,69],"training.":[20],"Although":[21],"GPU":[22],"capacity":[24],"improves":[25,155],"steadily,":[26],"model":[27,108,148],"sizes":[28],"expand":[29],"much":[30],"faster,":[31],"causing":[32],"frequent":[33],"Out-of-Memory":[34],"(OOM)":[35],"errors":[36],"rising":[38],"training":[39,70,156,179],"costs.":[40],"Existing":[41],"optimization":[43,192],"approaches,":[44],"such":[45],"as":[46],"ZeRO-3":[47,175],"offloading,":[49],"alleviate":[50],"per-GPU":[51],"pressure":[53],"introduce":[55],"excessive":[56],"collective":[57,105],"communication,":[58],"limited":[59],"computation\u2013communication":[60],"overlap,":[61,103],"degraded":[63],"scalability.":[64],"We":[65,129],"present":[66],"MAZ3,":[67],"a":[68,133,187],"framework":[71],"that":[72,144],"mitigates":[73],"these":[74],"limitations":[75],"through":[76],"three":[77],"key":[78],"techniques:":[79],"(1)":[80],"Collaborative":[81],"CPU\u2013GPU":[82],"management,":[84],"storing":[85],"full":[86],"parameters":[87],"in":[88],"CPU":[89],"broadcasting":[92],"them":[93],"within":[94],"nodes":[95],"to":[96,110,125],"reduce":[97],"global":[98],"synchronization;":[99],"(2)":[100],"Fine-grained":[101],"communication\u2013computation":[102],"aligning":[104],"operations":[106],"with":[107,139,166],"computation":[109],"hide":[111],"latency;":[112],"(3)":[113],"Hierarchical":[114],"aggregation":[115],"operators,":[116],"leveraging":[117],"intra-node":[118],"NVLink":[119],"inter-node":[121,147],"NIC":[122],"channels":[123],"concurrently":[124],"minimize":[126],"communication":[127,149],"overhead.":[128],"implement":[130],"MAZ3":[131,145,169],"on":[132],"multi-GPU":[134],"cluster":[135],"evaluate":[137],"it":[138],"large-scale":[140],"models.":[141],"Results":[142],"show":[143],"reduces":[146],"(gradients":[150],"parameters)":[152],"by":[153,158,163],"33%,":[154],"efficiency":[157,173,180],"40.3%,":[159],"increases":[161],"throughput":[162,182],"67.9%":[164],"compared":[165],"ZeRO-3.":[167],"Moreover,":[168],"retains":[170],"the":[171,178],"of":[174,183],"while":[176],"approaching":[177],"ZeRO-2":[184],"Offload,":[185],"achieving":[186],"balanced":[188],"trade-off":[189],"between":[190],"performance.":[194]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-14T00:00:00"}
