{"id":"https://openalex.org/W7133540384","doi":"https://doi.org/10.1109/hpca68181.2026.11408556","title":"eGPU: Production-Scale Elastic Sharing Over 10,000 GPUs","display_name":"eGPU: Production-Scale Elastic Sharing Over 10,000 GPUs","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7133540384","doi":"https://doi.org/10.1109/hpca68181.2026.11408556"},"language":null,"primary_location":{"id":"doi:10.1109/hpca68181.2026.11408556","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408556","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Xiaochuan Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Xiaochuan Tang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100324423","display_name":"Jie He","orcid":"https://orcid.org/0000-0002-2309-1596"},"institutions":[{"id":"https://openalex.org/I156087764","display_name":"University of California, Merced","ror":"https://ror.org/00d9ah105","country_code":"US","type":"education","lineage":["https://openalex.org/I156087764"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hao Qi","raw_affiliation_strings":["University of California,Merced"],"affiliations":[{"raw_affiliation_string":"University of California,Merced","institution_ids":["https://openalex.org/I156087764"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100743206","display_name":"Jianbo Dong","orcid":"https://orcid.org/0000-0003-0939-8943"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jianbo Dong","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023686830","display_name":"Yinghao Yu","orcid":"https://orcid.org/0000-0002-2744-845X"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yinghao Yu","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100298601","display_name":"Zhennan Xue","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhennan Xue","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128102687","display_name":"Zhengyu Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhengyu Zhang","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062105355","display_name":"Daocheng Ying","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daocheng Ying","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031366899","display_name":"Zheng Cao","orcid":"https://orcid.org/0000-0002-0968-1864"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zheng Cao","raw_affiliation_strings":["Alibaba Group"],"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5059976941","display_name":"Xiaoyi Lu","orcid":null},"institutions":[{"id":"https://openalex.org/I156087764","display_name":"University of California, Merced","ror":"https://ror.org/00d9ah105","country_code":"US","type":"education","lineage":["https://openalex.org/I156087764"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaoyi Lu","raw_affiliation_strings":["University of California,Merced"],"affiliations":[{"raw_affiliation_string":"University of California,Merced","institution_ids":["https://openalex.org/I156087764"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I4210095624"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.48334466,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"14"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4560000002384186,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4560000002384186,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.08789999783039093,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.050200000405311584,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/deformation","display_name":"Deformation (meteorology)","score":0.3174999952316284},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.29580000042915344},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.2476000040769577}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5594000220298767},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3513000011444092},{"id":"https://openalex.org/C204366326","wikidata":"https://www.wikidata.org/wiki/Q3027650","display_name":"Deformation (meteorology)","level":2,"score":0.3174999952316284},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.2892000079154968},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.2833000123500824},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2687999904155731},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.25609999895095825},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2547000050544739},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2476000040769577}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca68181.2026.11408556","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408556","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7","score":0.5851765871047974}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1502544429","https://openalex.org/W1939358748","https://openalex.org/W2000335122","https://openalex.org/W2071039340","https://openalex.org/W2113639928","https://openalex.org/W2124365587","https://openalex.org/W2153882824","https://openalex.org/W2194775991","https://openalex.org/W2290712622","https://openalex.org/W2493061014","https://openalex.org/W2608738900","https://openalex.org/W2613409207","https://openalex.org/W2750961421","https://openalex.org/W2982157693","https://openalex.org/W3197816522","https://openalex.org/W4211140533","https://openalex.org/W4291288469","https://openalex.org/W4321096498","https://openalex.org/W4388662057","https://openalex.org/W4394944658"],"related_works":[],"abstract_inverted_index":{"As":[0],"the":[1,22,183],"cost":[2],"of":[3,28,78,185],"GPUs":[4,79,186],"continues":[5],"to":[6,125,176,190],"rise,":[7],"GPU-sharing":[8,61],"solutions":[9,30],"have":[10],"become":[11],"increasingly":[12],"important":[13],"for":[14,64,153],"improving":[15],"efficiency":[16,172],"and":[17,43,59,71,88,133,164,195],"maximizing":[18],"resource":[19,86],"utilization.":[20],"At":[21],"same":[23],"time,":[24],"large-scale":[25,127],"operational":[26],"deployments":[27],"such":[29],"remain":[31],"relatively":[32],"less":[33],"explored,":[34],"especially":[35],"in":[36,110,118,136],"heterogeneous":[37],"production":[38,116,137],"environments":[39],"where":[40],"workload":[41],"dynamics":[42],"orchestration":[44],"complexity":[45],"introduce":[46,54],"new":[47],"practical":[48],"considerations.":[49],"In":[50],"this":[51],"paper,":[52],"we":[53],"eGPU,":[55],"an":[56],"elastic,":[57],"efficient,":[58],"scalable":[60],"framework":[62],"tailored":[63],"production-scale":[65],"concurrent":[66],"machine":[67],"learning":[68],"(ML)":[69],"training":[70],"inference.":[72],"eGPU":[73,95,120,161],"enables":[74],"fine-grained,":[75],"runtime-adjustable":[76],"sharing":[77,180],"across":[80],"multiple":[81],"jobs,":[82],"while":[83],"preserving":[84],"high":[85],"utilization":[87,199],"fault":[89],"isolation.":[90],"To":[91],"address":[92],"communication":[93,99],"bottlenecks,":[94],"supports":[96],"native":[97],"NVLink/NCCL-based":[98],"between":[100],"shared":[101],"GPU":[102,198],"instances,":[103],"capabilities":[104],"that":[105,160],"are":[106],"limited":[107],"or":[108],"unavailable":[109],"many":[111],"existing":[112],"designs.":[113],"Built":[114],"with":[115,122,139],"deployment":[117],"mind,":[119],"integrates":[121],"Kubernetes":[123],"(K8s)":[124],"support":[126],"orchestration.":[128],"It":[129],"has":[130],"been":[131],"deployed":[132],"running":[134],"stably":[135],"clusters":[138],"over":[140,167],"<tex":[141,191,203],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[142,192,204],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$\\text{1":[143],"0,":[144],"0":[145,146,147],"~":[148],"G":[149],"P":[150],"U":[151],"s}$</tex>":[152],"five":[154],"years.":[155],"Our":[156],"evaluation":[157],"results":[158],"show":[159],"achieves":[162],"elastic":[163],"precise":[165],"control":[166],"instance":[168],"sizes,":[169],"improves":[170,196],"job":[171],"by":[173,188,200],"21":[174],"%":[175],"31%":[177],"than":[178,202],"SOTA":[179],"solutions,":[181],"saves":[182],"number":[184],"required":[187],"up":[189],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$8":[193],"\\times$</tex>,":[194],"cluster":[197],"more":[201],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$\\mathrm{3}":[205],"\\times$</tex>.":[206]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2026-03-05T00:00:00"}
