{"id":"https://openalex.org/W7160660520","doi":"https://doi.org/10.48550/arxiv.2605.05888","title":"MoE-Hub: Taming Software Complexity for Seamless MoE Overlap with Hardware-Accelerated Communication on Multi-GPU Systems","display_name":"MoE-Hub: Taming Software Complexity for Seamless MoE Overlap with Hardware-Accelerated Communication on Multi-GPU Systems","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160660520","doi":"https://doi.org/10.48550/arxiv.2605.05888"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.05888","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05888","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.05888","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128051850","display_name":"Zhuoshan Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhuoshan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135647065","display_name":"Chen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100602366","display_name":"Shuyi Zhang","orcid":"https://orcid.org/0000-0002-1501-7847"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shuyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135684844","display_name":"Qijun Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qijun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135682436","display_name":"Haibo Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Haibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135657289","display_name":"Zhe Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049071763","display_name":"Zhipeng Tu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Zhipeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135698802","display_name":"Guangyu Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Guangyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080651207","display_name":"Yijia Diao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Diao, Yijia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135696977","display_name":"Zhigang Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Zhigang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003939279","display_name":"Jingwen Leng","orcid":"https://orcid.org/0000-0002-5660-5493"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leng, Jingwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135723352","display_name":"Guanghui He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Guanghui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135688432","display_name":"Minyi Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Minyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.7534000277519226,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.7534000277519226,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.044599998742341995,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.042399998754262924,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6011000275611877},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5672000050544739},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4505999982357025},{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.4438999891281128},{"id":"https://openalex.org/keywords/orchestration","display_name":"Orchestration","score":0.43869999051094055},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4198000133037567},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.38850000500679016},{"id":"https://openalex.org/keywords/models-of-communication","display_name":"Models of communication","score":0.37619999051094055},{"id":"https://openalex.org/keywords/data-transmission","display_name":"Data transmission","score":0.3691999912261963},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.34049999713897705}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8507000207901001},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6011000275611877},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5698999762535095},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5672000050544739},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4505999982357025},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.4438999891281128},{"id":"https://openalex.org/C199168358","wikidata":"https://www.wikidata.org/wiki/Q3367000","display_name":"Orchestration","level":3,"score":0.43869999051094055},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4198000133037567},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3952000141143799},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.3919000029563904},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.38850000500679016},{"id":"https://openalex.org/C158156997","wikidata":"https://www.wikidata.org/wiki/Q1416645","display_name":"Models of communication","level":2,"score":0.37619999051094055},{"id":"https://openalex.org/C557945733","wikidata":"https://www.wikidata.org/wiki/Q389772","display_name":"Data transmission","level":2,"score":0.3691999912261963},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.34049999713897705},{"id":"https://openalex.org/C12269588","wikidata":"https://www.wikidata.org/wiki/Q132364","display_name":"Communications protocol","level":2,"score":0.3336000144481659},{"id":"https://openalex.org/C77270119","wikidata":"https://www.wikidata.org/wiki/Q1655198","display_name":"Software-defined networking","level":2,"score":0.3296999931335449},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.32249999046325684},{"id":"https://openalex.org/C174683762","wikidata":"https://www.wikidata.org/wiki/Q609588","display_name":"Component-based software engineering","level":4,"score":0.3224000036716461},{"id":"https://openalex.org/C149091818","wikidata":"https://www.wikidata.org/wiki/Q2429814","display_name":"Software system","level":3,"score":0.3127000033855438},{"id":"https://openalex.org/C101765175","wikidata":"https://www.wikidata.org/wiki/Q577764","display_name":"Communications system","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.31150001287460327},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.3102000057697296},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.3068999946117401},{"id":"https://openalex.org/C761482","wikidata":"https://www.wikidata.org/wiki/Q118093","display_name":"Transmission (telecommunications)","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2992999851703644},{"id":"https://openalex.org/C147358964","wikidata":"https://www.wikidata.org/wiki/Q1200992","display_name":"Abstraction layer","level":3,"score":0.29159998893737793},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2842999994754791},{"id":"https://openalex.org/C35869016","wikidata":"https://www.wikidata.org/wiki/Q846636","display_name":"Software architecture","level":3,"score":0.28200000524520874},{"id":"https://openalex.org/C179145077","wikidata":"https://www.wikidata.org/wiki/Q5154130","display_name":"Communication complexity","level":2,"score":0.2671000063419342},{"id":"https://openalex.org/C192126672","wikidata":"https://www.wikidata.org/wiki/Q1068715","display_name":"Telecommunications network","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C76518257","wikidata":"https://www.wikidata.org/wiki/Q271680","display_name":"Software framework","level":5,"score":0.26409998536109924},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C83529365","wikidata":"https://www.wikidata.org/wiki/Q4972103","display_name":"Broadcast communication network","level":2,"score":0.2556999921798706}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.05888","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05888","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.05888","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05888","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"Mixture-of-Experts":[1],"(MoE)":[2],"architecture":[3],"is":[4,14,29],"crucial":[5],"for":[6],"scaling":[7],"large":[8],"language":[9],"models,":[10],"but":[11],"its":[12,34],"scalability":[13],"severely":[15],"limited":[16],"by":[17,137],"inter-GPU":[18],"communication":[19,26,70,106,148],"bottlenecks":[20],"in":[21,41,140],"multi-GPU":[22],"systems.":[23,171],"Although":[24],"overlapping":[25],"with":[27],"computation":[28],"a":[30,56,77,99,104,125],"widely":[31],"recognized":[32],"optimization,":[33],"effective":[35],"deployment":[36],"still":[37],"remains":[38],"challenging,":[39],"both":[40],"terms":[42],"of":[43,72],"performance":[44,89],"and":[45,66,90,131,154,165],"programmability.":[46],"In":[47],"this":[48],"work,":[49],"we":[50,96],"identify":[51],"the":[52,67,141,146],"root":[53],"cause":[54],"as":[55],"fundamental":[57],"abstraction":[58],"mismatch":[59],"between":[60],"MoE's":[61],"dynamic,":[62],"irregular":[63],"token-to-expert":[64],"mapping":[65],"static,":[68],"address-centric":[69],"model":[71],"modern":[73],"GPUs,":[74],"which":[75],"necessitates":[76],"complex":[78],"software":[79,91],"mediation":[80],"phase":[81],"to":[82,117],"resolve":[83,94],"addresses":[84],"before":[85],"data":[86,110,119],"transfers,":[87],"limiting":[88],"flexibility.":[92],"To":[93],"this,":[95],"propose":[97],"MoE-Hub,":[98],"hardware-software":[100],"co-design":[101],"that":[102,160],"introduces":[103],"destination-agnostic":[105],"paradigm.":[107],"MoE-Hub":[108,151,161],"decouples":[109],"transmission":[111],"from":[112],"address":[113,129],"management,":[114],"allowing":[115],"producers":[116],"send":[118],"immediately":[120],"after":[121],"routing":[122],"using":[123],"only":[124],"logical":[126],"destination,":[127],"while":[128],"allocation":[130],"data-flow":[132],"orchestration":[133],"are":[134],"handled":[135],"transparently":[136],"lightweight":[138],"hardware":[139],"GPU":[142],"hub.":[143],"By":[144],"hardware-accelerating":[145],"entire":[147],"control":[149],"plane,":[150],"enables":[152],"seamless":[153],"transparent":[155],"overlap.":[156],"Our":[157],"evaluation":[158],"shows":[159],"achieves":[162],"1.40x-3.08x":[163],"per-layer":[164],"1.21x-1.98x":[166],"end-to-end":[167],"speedup":[168],"over":[169],"state-of-the-art":[170]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-09T00:00:00"}
