{"id":"https://openalex.org/W7134990851","doi":"https://doi.org/10.1109/asp-dac66049.2026.11420335","title":"Efficient CPU-GPU Collaborative Inference for MoE-based LLMs on Memory-Limited Systems","display_name":"Efficient CPU-GPU Collaborative Inference for MoE-based LLMs on Memory-Limited Systems","publication_year":2026,"publication_date":"2026-01-19","ids":{"openalex":"https://openalex.org/W7134990851","doi":"https://doi.org/10.1109/asp-dac66049.2026.11420335"},"language":null,"primary_location":{"id":"doi:10.1109/asp-dac66049.2026.11420335","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asp-dac66049.2026.11420335","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 31st Asia and South Pacific Design Automation Conference (ASP-DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067971879","display_name":"En-Ming Huang","orcid":"https://orcid.org/0000-0003-2196-2834"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"En-Ming Huang","raw_affiliation_strings":["National Taiwan University,Taipei,Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taipei,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Li-Shang Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I25846049","display_name":"National Tsing Hua University","ror":"https://ror.org/00zdnkx70","country_code":"TW","type":"education","lineage":["https://openalex.org/I25846049"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Li-Shang Lin","raw_affiliation_strings":["National Tsing Hua University,Hsinchu,Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Tsing Hua University,Hsinchu,Taiwan","institution_ids":["https://openalex.org/I25846049"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028600832","display_name":"Chun\u2010Yi Lee","orcid":"https://orcid.org/0000-0002-4680-4800"},"institutions":[{"id":"https://openalex.org/I16733864","display_name":"National Taiwan University","ror":"https://ror.org/05bqach95","country_code":"TW","type":"education","lineage":["https://openalex.org/I16733864"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Chun-Yi Lee","raw_affiliation_strings":["National Taiwan University,Taipei,Taiwan"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Taiwan University,Taipei,Taiwan","institution_ids":["https://openalex.org/I16733864"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.30757752,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"333","last_page":"340"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4235999882221222,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4235999882221222,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.07930000126361847,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.07559999823570251,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4747999906539917},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3172000050544739},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.2558000087738037},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.2547999918460846}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5715000033378601},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4747999906539917},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4235000014305115},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3172000050544739},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.29019999504089355},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.28360000252723694},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.27570000290870667},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2752000093460083},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2547999918460846}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asp-dac66049.2026.11420335","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asp-dac66049.2026.11420335","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 31st Asia and South Pacific Design Automation Conference (ASP-DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W1988888548","https://openalex.org/W4321636575","https://openalex.org/W4385245566","https://openalex.org/W4401211627","https://openalex.org/W4402683990","https://openalex.org/W4403006781"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4],"achieved":[5],"impressive":[6],"results":[7],"across":[8],"various":[9],"tasks,":[10],"yet":[11],"their":[12],"high":[13],"computational":[14],"demands":[15],"pose":[16],"deployment":[17],"challenges,":[18],"especially":[19],"on":[20,88,142],"consumergrade":[21,143],"hardware.":[22],"Mixture":[23],"of":[24,35,121,131,147],"Experts":[25],"(MoE)":[26],"models":[27,47],"provide":[28],"an":[29,84],"efficient":[30,109],"solution":[31],"through":[32,100],"selective":[33],"activation":[34],"parameter":[36],"subsets,":[37],"which":[38,113],"reduces":[39],"computation":[40],"requirements.":[41],"Despite":[42],"this":[43],"efficiency,":[44],"state-of-the-art":[45],"MoE":[46],"still":[48],"require":[49],"substantial":[50],"memory":[51],"beyond":[52],"typical":[53],"consumer":[54],"GPU":[55,67,90],"capacities.":[56],"Traditional":[57],"offloading":[58],"methods":[59],"that":[60,82],"transfer":[61,94],"model":[62],"weights":[63],"between":[64],"CPU":[65,107,116],"and":[66,96,127],"introduce":[68],"latency,":[69],"limiting":[70],"inference":[71,80,99,140],"performance.":[72],"This":[73],"paper":[74],"presents":[75],"a":[76],"novel":[77],"CPU-GPU":[78,132],"collaborative":[79],"framework":[81,123,149],"incorporates":[83],"expert":[85],"caching":[86],"mechanism":[87],"the":[89,129],"to":[91,106,134],"reduce":[92],"data":[93],"requirements":[95],"enable":[97],"faster":[98],"cache":[101,110],"hits.":[102],"Computations":[103],"are":[104],"offloaded":[105],"for":[108,138],"miss":[111],"handling,":[112],"benefits":[114],"from":[115],"multithreading":[117],"optimizations.":[118],"The":[119,145],"evaluations":[120],"our":[122,148],"demonstrate":[124],"performance":[125],"improvements":[126],"highlight":[128],"potential":[130],"collaboration":[133],"maximize":[135],"hardware":[136],"utilization":[137],"single-request":[139],"scenarios":[141],"systems.":[144],"implementation":[146],"is":[150],"available":[151],"at":[152],"github.com/elsa-lab/MoE-CPU-GPU-Collaborative-Inference.":[153]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-12T00:00:00"}
