{"id":"https://openalex.org/W4360831999","doi":"https://doi.org/10.1109/hpca56546.2023.10071077","title":"MPress: Democratizing Billion-Scale Model Training on Multi-GPU Servers via Memory-Saving Inter-Operator Parallelism","display_name":"MPress: Democratizing Billion-Scale Model Training on Multi-GPU Servers via Memory-Saving Inter-Operator Parallelism","publication_year":2023,"publication_date":"2023-02-01","ids":{"openalex":"https://openalex.org/W4360831999","doi":"https://doi.org/10.1109/hpca56546.2023.10071077"},"language":"en","primary_location":{"id":"doi:10.1109/hpca56546.2023.10071077","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca56546.2023.10071077","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049212830","display_name":"Quan Zhou","orcid":"https://orcid.org/0009-0000-9613-2478"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Quan Zhou","raw_affiliation_strings":["University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101567973","display_name":"Haiquan Wang","orcid":"https://orcid.org/0000-0003-1745-9814"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haiquan Wang","raw_affiliation_strings":["University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100695163","display_name":"Xiaoyan Yu","orcid":"https://orcid.org/0000-0003-0351-8393"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoyan Yu","raw_affiliation_strings":["University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100354226","display_name":"Cheng Li","orcid":"https://orcid.org/0000-0001-7064-6120"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210154819","display_name":"Centre for High Performance Computing","ror":"https://ror.org/0409egh51","country_code":"ZA","type":"other","lineage":["https://openalex.org/I4210154819"]}],"countries":["CN","ZA"],"is_corresponding":false,"raw_author_name":"Cheng Li","raw_affiliation_strings":["University of Science and Technology of China","Anhui Province Key Laboratory of High Performance Computing"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Anhui Province Key Laboratory of High Performance Computing","institution_ids":["https://openalex.org/I4210154819"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037054290","display_name":"Youhui Bai","orcid":null},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Youhui Bai","raw_affiliation_strings":["University of Science and Technology of China"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100381152","display_name":"Feng Yan","orcid":"https://orcid.org/0000-0001-9840-7754"},"institutions":[{"id":"https://openalex.org/I44461941","display_name":"University of Houston","ror":"https://ror.org/048sx0r50","country_code":"US","type":"education","lineage":["https://openalex.org/I44461941"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Feng Yan","raw_affiliation_strings":["University of Houston"],"affiliations":[{"raw_affiliation_string":"University of Houston","institution_ids":["https://openalex.org/I44461941"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070385135","display_name":"Yinlong Xu","orcid":"https://orcid.org/0000-0001-9586-0561"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210154819","display_name":"Centre for High Performance Computing","ror":"https://ror.org/0409egh51","country_code":"ZA","type":"other","lineage":["https://openalex.org/I4210154819"]}],"countries":["CN","ZA"],"is_corresponding":false,"raw_author_name":"Yinlong Xu","raw_affiliation_strings":["University of Science and Technology of China","Anhui Province Key Laboratory of High Performance Computing"],"affiliations":[{"raw_affiliation_string":"University of Science and Technology of China","institution_ids":["https://openalex.org/I126520041"]},{"raw_affiliation_string":"Anhui Province Key Laboratory of High Performance Computing","institution_ids":["https://openalex.org/I4210154819"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5049212830"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":2.0412,"has_fulltext":false,"cited_by_count":17,"citation_normalized_percentile":{"value":0.88705425,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"556","last_page":"569"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9850999712944031,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9733999967575073,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8502854108810425},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6826284527778625},{"id":"https://openalex.org/keywords/server","display_name":"Server","score":0.6331298351287842},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5247737765312195},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.47835296392440796},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.46967822313308716},{"id":"https://openalex.org/keywords/memory-model","display_name":"Memory model","score":0.44443172216415405},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2838420867919922},{"id":"https://openalex.org/keywords/shared-memory","display_name":"Shared memory","score":0.2729649245738983},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.24498674273490906}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8502854108810425},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6826284527778625},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.6331298351287842},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5247737765312195},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.47835296392440796},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.46967822313308716},{"id":"https://openalex.org/C12186640","wikidata":"https://www.wikidata.org/wiki/Q6815743","display_name":"Memory model","level":3,"score":0.44443172216415405},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2838420867919922},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.2729649245738983},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.24498674273490906}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca56546.2023.10071077","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca56546.2023.10071077","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":68,"referenced_works":["https://openalex.org/W1493774699","https://openalex.org/W1724438581","https://openalex.org/W1902934009","https://openalex.org/W2119144962","https://openalex.org/W2285660444","https://openalex.org/W2319920447","https://openalex.org/W2330958039","https://openalex.org/W2338908902","https://openalex.org/W2489529491","https://openalex.org/W2763421725","https://openalex.org/W2809901035","https://openalex.org/W2883830791","https://openalex.org/W2891993230","https://openalex.org/W2896457183","https://openalex.org/W2900824371","https://openalex.org/W2952777853","https://openalex.org/W2963674932","https://openalex.org/W2969388332","https://openalex.org/W2975712713","https://openalex.org/W2991040477","https://openalex.org/W3006131567","https://openalex.org/W3012479151","https://openalex.org/W3015963989","https://openalex.org/W3035078899","https://openalex.org/W3081168214","https://openalex.org/W3085139254","https://openalex.org/W3101104221","https://openalex.org/W3129831491","https://openalex.org/W3132107458","https://openalex.org/W3163465001","https://openalex.org/W3174394143","https://openalex.org/W3198659451","https://openalex.org/W3205803342","https://openalex.org/W4224330420","https://openalex.org/W4226479682","https://openalex.org/W4287756266","https://openalex.org/W4293404878","https://openalex.org/W4301361180","https://openalex.org/W6637709462","https://openalex.org/W6638632666","https://openalex.org/W6638783484","https://openalex.org/W6639703010","https://openalex.org/W6677580257","https://openalex.org/W6684859321","https://openalex.org/W6700264148","https://openalex.org/W6703652217","https://openalex.org/W6717768942","https://openalex.org/W6745245109","https://openalex.org/W6748645090","https://openalex.org/W6752790953","https://openalex.org/W6755207826","https://openalex.org/W6756009870","https://openalex.org/W6756488470","https://openalex.org/W6763701032","https://openalex.org/W6767997687","https://openalex.org/W6768723914","https://openalex.org/W6772383348","https://openalex.org/W6774125022","https://openalex.org/W6779999780","https://openalex.org/W6781494818","https://openalex.org/W6784425352","https://openalex.org/W6787953186","https://openalex.org/W6794212170","https://openalex.org/W6795581714","https://openalex.org/W6799372109","https://openalex.org/W6810802675","https://openalex.org/W6811928498","https://openalex.org/W6838632916"],"related_works":["https://openalex.org/W2388464034","https://openalex.org/W2533125852","https://openalex.org/W2140460949","https://openalex.org/W2105580438","https://openalex.org/W2057435755","https://openalex.org/W2018782216","https://openalex.org/W2949620858","https://openalex.org/W2770877918","https://openalex.org/W1989375655","https://openalex.org/W2805904889"],"abstract_inverted_index":{"It":[0],"remains":[1],"challenging":[2],"to":[3,15,96,119,122,158,223],"train":[4,224],"billion-scale":[5,58],"DNN":[6,181],"models":[7,226],"on":[8,126,186],"a":[9,46,76,107,127],"single":[10],"modern":[11,188],"multi-GPU":[12,49],"server":[13],"due":[14],"the":[16,33,53,68,82,150,156,192,210,216,228],"GPU":[17,54,138,189],"memory":[18,42,55,139,143,218],"wall.":[19],"Unfortunately,":[20],"existing":[21],"memory-saving":[22,72],"techniques":[23,73],"such":[24],"as":[25],"GPU-CPU":[26],"swap,":[27,95],"recomputation,":[28],"and":[29,74,90,94,100,141,168,184,194],"ZeRO-Series":[30,214],"come":[31],"at":[32],"price":[34],"of":[35,57,70],"extra":[36,63],"computation,":[37],"communication":[38,88],"overhead,":[39],"or":[40,201],"limited":[41],"reduction.We":[43],"present":[44],"MPress,":[45],"new":[47],"single-server":[48],"system":[50],"that":[51,130,206],"breaks":[52],"wall":[56],"model":[59,102],"training":[60,98,133,174,211],"while":[61,220],"minimizing":[62],"cost.":[64],"MPress":[65,105,165,207],"first":[66],"discusses":[67],"trade-offs":[69],"various":[71],"offers":[75],"holistic":[77],"solution,":[78],"which":[79,113],"alternatively":[80],"chooses":[81],"inter-operator":[83,131,172],"parallelism":[84],"with":[85,92,166,178,198,215],"low":[86],"cross-GPU":[87],"traffics,":[89],"combines":[91],"recomputation":[93,229],"balance":[97],"performance":[99],"sustained":[101],"sizes.":[103],"Additionally,":[104],"employs":[106],"novel,":[108],"fast":[109],"D2D":[110],"swap":[111,120],"technique,":[112],"simultaneously":[114],"utilizes":[115],"multiple":[116],"high-bandwidth":[117],"NVLink":[118],"tensors":[121],"light-load":[123],"GPUs,":[124],"based":[125],"key":[128],"observation":[129],"parallel":[132,173],"may":[134],"result":[135],"in":[136],"imbalanced":[137],"utilization":[140],"spare":[142],"space":[144],"from":[145,191],"least":[146],"used":[147],"devices":[148],"plus":[149],"high-end":[151],"interconnects":[152],"among":[153],"them":[154],"have":[155],"opportunity":[157],"support":[159],"low-overhead":[160],"swapping.":[161],"Finally,":[162],"we":[163],"integrate":[164],"PipeDream":[167],"DAPPLE,":[169],"two":[170,179,187],"representative":[171],"systems.":[175],"Experimental":[176],"results":[177],"popular":[180],"models,":[182],"Bert,":[183],"GPT,":[185],"servers":[190],"DGX-1":[193],"DGX-2":[195],"generation,":[196],"equipped":[197],"8":[199],"V100":[200],"A100":[202],"cards,":[203],"respectively,":[204],"demonstrate":[205],"significantly":[208],"improves":[209],"throughput":[212],"over":[213],"identical":[217],"reduction,":[219],"being":[221],"able":[222],"larger":[225],"than":[227],"baseline.":[230]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":1}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2025-10-10T00:00:00"}
