{"id":"https://openalex.org/W4416429509","doi":"https://doi.org/10.1109/iccad66269.2025.11240984","title":"HD-MoE: Hybrid and Dynamic Parallelism for Mixture-of-Expert LLMs with 3D Near-Memory Processing","display_name":"HD-MoE: Hybrid and Dynamic Parallelism for Mixture-of-Expert LLMs with 3D Near-Memory Processing","publication_year":2025,"publication_date":"2025-10-26","ids":{"openalex":"https://openalex.org/W4416429509","doi":"https://doi.org/10.1109/iccad66269.2025.11240984"},"language":null,"primary_location":{"id":"doi:10.1109/iccad66269.2025.11240984","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240984","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079747680","display_name":"Hao\u2010Chen Huang","orcid":"https://orcid.org/0000-0001-6130-5981"},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haochen Huang","raw_affiliation_strings":["Peking University,Institute for Artificial Intelligence,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Institute for Artificial Intelligence,Beijing,China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081222727","display_name":"Shuzhang Zhong","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuzhang Zhong","raw_affiliation_strings":["Peking University,Institute for Artificial Intelligence,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Institute for Artificial Intelligence,Beijing,China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100443009","display_name":"Zhe Zhang","orcid":"https://orcid.org/0000-0002-7793-6574"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhe Zhang","raw_affiliation_strings":["Alibaba Group,DAMO Academy,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,DAMO Academy,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102837743","display_name":"Shuangchen Li","orcid":"https://orcid.org/0009-0003-6986-0463"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuangchen Li","raw_affiliation_strings":["Alibaba Group,DAMO Academy,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,DAMO Academy,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068606980","display_name":"Dimin Niu","orcid":"https://orcid.org/0000-0001-8440-3875"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dimin Niu","raw_affiliation_strings":["Alibaba Group,DAMO Academy,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,DAMO Academy,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046896624","display_name":"Hongzhong Zheng","orcid":"https://orcid.org/0000-0001-7696-9799"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongzhong Zheng","raw_affiliation_strings":["Alibaba Group,DAMO Academy,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,DAMO Academy,Beijing,China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002760019","display_name":"Runsheng Wang","orcid":"https://orcid.org/0000-0002-7514-0767"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Runsheng Wang","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100457502","display_name":"Meng Li","orcid":"https://orcid.org/0000-0002-7212-2264"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meng Li","raw_affiliation_strings":["Peking University,Institute for Artificial Intelligence,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Institute for Artificial Intelligence,Beijing,China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5079747680"],"corresponding_institution_ids":["https://openalex.org/I20231570","https://openalex.org/I4210100255"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18751093,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.12210000306367874,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.12210000306367874,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.11640000343322754,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.11169999837875366,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.6620000004768372},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.599399983882904},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5587999820709229},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.5577999949455261},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.40529999136924744},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.3720000088214874},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.3675000071525574},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.3630000054836273}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8282999992370605},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7006000280380249},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.6620000004768372},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.599399983882904},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5587999820709229},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.5577999949455261},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.41589999198913574},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.40529999136924744},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3720000088214874},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.3675000071525574},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.36660000681877136},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.3630000054836273},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C91481028","wikidata":"https://www.wikidata.org/wiki/Q1054686","display_name":"Distributed memory","level":3,"score":0.32030001282691956},{"id":"https://openalex.org/C106515295","wikidata":"https://www.wikidata.org/wiki/Q26806595","display_name":"Parallel processing","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.2818000018596649},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.2809999883174896},{"id":"https://openalex.org/C42992933","wikidata":"https://www.wikidata.org/wiki/Q691169","display_name":"Task parallelism","level":3,"score":0.2745000123977661},{"id":"https://openalex.org/C140763907","wikidata":"https://www.wikidata.org/wiki/Q2714055","display_name":"Instruction-level parallelism","level":3,"score":0.272599995136261},{"id":"https://openalex.org/C150495011","wikidata":"https://www.wikidata.org/wiki/Q128392","display_name":"Concurrent computing","level":2,"score":0.271699994802475},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.27129998803138733},{"id":"https://openalex.org/C120373497","wikidata":"https://www.wikidata.org/wiki/Q1087987","display_name":"Parallel algorithm","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C2779602883","wikidata":"https://www.wikidata.org/wiki/Q15544750","display_name":"Memory architecture","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.26080000400543213},{"id":"https://openalex.org/C107598950","wikidata":"https://www.wikidata.org/wiki/Q259864","display_name":"Microarchitecture","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccad66269.2025.11240984","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240984","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320337504","display_name":"Research and Development","ror":"https://ror.org/027s68j25"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W1999085092","https://openalex.org/W2009571103","https://openalex.org/W3096425133","https://openalex.org/W3136346557","https://openalex.org/W4220967350","https://openalex.org/W4220972538","https://openalex.org/W4285730081","https://openalex.org/W4387064011","https://openalex.org/W4401211878","https://openalex.org/W4403006781","https://openalex.org/W4410583035","https://openalex.org/W4411485931","https://openalex.org/W4414198736"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"with":[4,12,45,190],"Mixture-of-Expert":[5],"(MoE)":[6],"architectures":[7],"achieve":[8],"superior":[9],"model":[10],"performance":[11],"reduced":[13],"computation":[14,70,97,128,156],"costs,":[15],"but":[16],"at":[17],"the":[18,36,68,73,112,125,150,155,186],"cost":[19],"of":[20,107],"high":[21,43,46,92],"memory":[22,33,62],"capacity":[23],"and":[24,63,85,142,181],"bandwidth":[25,44],"requirements.":[26],"Near-Memory":[27],"Processing":[28],"(NMP)":[29],"accelerators":[30,59],"that":[31,164],"stack":[32],"directly":[34,71],"on":[35],"compute":[37],"through":[38],"hybrid":[39,138],"bonding":[40],"have":[41],"demonstrated":[42],"energy":[47],"efficiency,":[48],"becoming":[49],"a":[50,167],"promising":[51],"architecture":[52],"for":[53],"MoE":[54,69,108,126],"models.":[55],"However,":[56],"as":[57],"NMP":[58,131],"comprise":[60],"distributed":[61],"computation,":[64],"how":[65],"to":[66,100,122,148,172,177,183],"map":[67],"determines":[72],"LLM":[74],"inference":[75],"efficiency.":[76,102],"Existing":[77],"parallel":[78,127,139],"mapping":[79,140],"strategies,":[80],"including":[81],"Tensor":[82],"Parallelism":[83,87],"(TP)":[84],"Expert":[86],"(EP),":[88],"suffer":[89],"from":[90,170],"either":[91],"communication":[93,151],"costs":[94,152],"or":[95],"unbalanced":[96],"utilization,":[98],"leading":[99],"inferior":[101],"The":[103],"dynamic":[104,145],"routing":[105],"mechanism":[106],"LLMs":[109],"further":[110],"aggravates":[111],"efficiency":[113],"challenges.":[114],"Therefore,":[115],"in":[116],"this":[117],"paper,":[118],"we":[119,162],"propose":[120],"HD-MoE":[121,133,165],"automatically":[123],"optimize":[124],"across":[129],"an":[130,135,143],"accelerator.":[132],"features":[134],"offline":[136],"automatic":[137],"algorithm":[141],"online":[144],"scheduling":[146],"strategy":[147],"reduce":[149],"while":[153],"maximizing":[154],"utilization.":[157],"With":[158],"extensive":[159],"experimental":[160],"results,":[161],"demonstrate":[163],"achieves":[166],"speedup":[168],"ranging":[169],"1.1\u00d7":[171,176],"1.8\u00d7":[173],"over":[174,179,185],"TP,":[175],"1.5\u00d7":[178],"EP,":[180],"1.0\u00d7":[182],"1.4\u00d7":[184],"baseline":[187],"Hybrid":[188],"TP-EP":[189],"Compute-Balanced":[191],"parallelism":[192],"strategies.":[193]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-11-20T00:00:00"}
