{"id":"https://openalex.org/W4414198736","doi":"https://doi.org/10.1109/dac63849.2025.11133274","title":"HybriMoE: Hybrid CPU-GPU Scheduling and Cache Management for Efficient MoE Inference","display_name":"HybriMoE: Hybrid CPU-GPU Scheduling and Cache Management for Efficient MoE Inference","publication_year":2025,"publication_date":"2025-06-22","ids":{"openalex":"https://openalex.org/W4414198736","doi":"https://doi.org/10.1109/dac63849.2025.11133274"},"language":"en","primary_location":{"id":"doi:10.1109/dac63849.2025.11133274","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11133274","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081222727","display_name":"Shuzhang Zhong","orcid":null},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Shuzhang Zhong","raw_affiliation_strings":["Institute for Artificial Intelligence, Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Institute for Artificial Intelligence, Peking University,Beijing,China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103031318","display_name":"Yao Sun","orcid":"https://orcid.org/0000-0002-0178-3612"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanfan Sun","raw_affiliation_strings":["Beihang University,School of Computer Science and Engineering,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Beihang University,School of Computer Science and Engineering,Beijing,China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064892269","display_name":"Ling Liang","orcid":"https://orcid.org/0000-0002-8534-6494"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ling Liang","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002760019","display_name":"Runsheng Wang","orcid":"https://orcid.org/0000-0002-7514-0767"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Runsheng Wang","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062886480","display_name":"Ru Huang","orcid":"https://orcid.org/0000-0002-8146-4821"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ru Huang","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100457502","display_name":"Meng Li","orcid":"https://orcid.org/0000-0002-7212-2264"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meng Li","raw_affiliation_strings":["Institute for Artificial Intelligence, Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Institute for Artificial Intelligence, Peking University,Beijing,China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5081222727"],"corresponding_institution_ids":["https://openalex.org/I20231570","https://openalex.org/I4210100255"],"apc_list":null,"apc_paid":null,"fwci":3.3332,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.93500595,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9745000004768372,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9745000004768372,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9589999914169312,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9057000279426575,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.722599983215332},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6312999725341797},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6082000136375427},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.47200000286102295},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.4404999911785126},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.43939998745918274},{"id":"https://openalex.org/keywords/expert-system","display_name":"Expert system","score":0.39340001344680786},{"id":"https://openalex.org/keywords/schedule","display_name":"Schedule","score":0.3447999954223633}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.84579998254776},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.722599983215332},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6312999725341797},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6082000136375427},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.47200000286102295},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.4404999911785126},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.43939998745918274},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4025000035762787},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.39340001344680786},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3596999943256378},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.3447999954223633},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.32330000400543213},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.31310001015663147},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C142614401","wikidata":"https://www.wikidata.org/wiki/Q777433","display_name":"Forward chaining","level":3,"score":0.28870001435279846},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2858999967575073},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.28209999203681946},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.27230000495910645},{"id":"https://openalex.org/C49020025","wikidata":"https://www.wikidata.org/wiki/Q1059099","display_name":"Chaining","level":2,"score":0.26669999957084656},{"id":"https://openalex.org/C107568181","wikidata":"https://www.wikidata.org/wiki/Q5319000","display_name":"Dynamic priority scheduling","level":3,"score":0.2621000111103058},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.25870001316070557},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2572000026702881}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac63849.2025.11133274","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11133274","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2066334462","https://openalex.org/W4321636575","https://openalex.org/W4389524418","https://openalex.org/W4401211627","https://openalex.org/W4402671950","https://openalex.org/W4403006781","https://openalex.org/W4403337153","https://openalex.org/W4404401018","https://openalex.org/W4405755183"],"related_works":[],"abstract_inverted_index":{"The":[0],"Mixture":[1],"of":[2,73,177,199],"Experts":[3],"(MoE)":[4],"architecture":[5],"has":[6,50],"demonstrated":[7],"significant":[8,45],"advantages":[9],"as":[10],"it":[11,183],"enables":[12],"to":[13,53,57,102,148,167,216],"increase":[14,21],"the":[15,25,69,80,89,92,103,178,205,212],"model":[16,28],"capacity":[17],"without":[18],"a":[19,122,132,143,163],"proportional":[20],"in":[22,84,116,204,211],"computation.":[23],"However,":[24],"large":[26],"MoE":[27,74,97,219],"size":[29],"still":[30],"introduces":[31,141],"substantial":[32],"memory":[33],"demands,":[34],"which":[35],"usually":[36],"requires":[37],"expert":[38,59,70,105,169],"offloading":[39],"on":[40,66,88,175,184],"resource-constrained":[41],"platforms":[42],"and":[43,136,153,161,181,208],"incurs":[44],"overhead.":[46],"Hybrid":[47],"CPU-GPU":[48,94,124,134],"inference":[49,125,220],"been":[51],"proposed":[52],"leverage":[54],"CPU":[55,152],"computation":[56],"reduce":[58],"loading":[60],"overhead":[61],"but":[62],"faces":[63],"major":[64],"challenges:":[65],"one":[67],"hand,":[68,91],"activation":[71,170],"patterns":[72],"models":[75],"are":[76],"highly":[77],"unstable,":[78],"rendering":[79],"fixed":[81],"mapping":[82],"strategies":[83],"existing":[85],"works":[86],"inefficient;":[87],"other":[90],"hybrid":[93,123,218],"schedule":[95],"for":[96],"is":[98,224],"inherently":[99],"complex":[100],"due":[101],"diverse":[104],"sizes,":[106],"structures,":[107],"uneven":[108],"workload":[109],"distribution,":[110],"etc.":[111],"To":[112],"address":[113],"these":[114],"challenges,":[115],"this":[117],"paper,":[118],"we":[119],"propose":[120],"HybriMoE,":[121],"framework":[126,180],"that":[127,193],"improves":[128],"resource":[129],"utilization":[130],"through":[131],"novel":[133],"scheduling":[135,146],"cache":[137],"management":[138],"system.":[139],"HybriMoE":[140,174,194],"(i)":[142],"dynamic":[144],"intra-layer":[145],"strategy":[147],"balance":[149],"workloads":[150],"across":[151],"GPU,":[154],"(ii)":[155],"an":[156,196],"impact-driven":[157],"inter-layer":[158],"prefetching":[159],"algorithm,":[160],"(iii)":[162],"score-based":[164],"caching":[165],"algorithm":[166],"mitigate":[168],"instability.":[171],"We":[172],"implement":[173],"top":[176],"kTransformers":[179],"evaluate":[182],"three":[185],"widely":[186],"used":[187],"MoE-based":[188],"LLMs.":[189],"Experimental":[190],"results":[191],"demonstrate":[192],"achieves":[195],"average":[197],"speedup":[198],"$\\mathbf{1.":[200],"3":[201],"3}":[202],"\\times$":[203,210],"prefill":[206],"stage":[207,214],"$1.70":[209],"decode":[213],"compared":[215],"state-of-the-art":[217],"framework.":[221],"Our":[222],"code":[223],"available":[225],"at:":[226],"https://github.com/PKU-SEC-Lab/HybriMoE.":[227]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
