{"id":"https://openalex.org/W7134927389","doi":"https://doi.org/10.1145/3779212.3790187","title":"MoE-APEX: An Efficient MoE Inference System with Adaptive Precision Expert Offloading","display_name":"MoE-APEX: An Efficient MoE Inference System with Adaptive Precision Expert Offloading","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134927389","doi":"https://doi.org/10.1145/3779212.3790187"},"language":null,"primary_location":{"id":"doi:10.1145/3779212.3790187","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790187","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3779212.3790187","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128719323","display_name":"Peng Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Peng Tang","raw_affiliation_strings":["Shanghai Jiaotong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0005-8196-3953","affiliations":[{"raw_affiliation_string":"Shanghai Jiaotong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100656307","display_name":"Jiacheng Liu","orcid":"https://orcid.org/0000-0003-0378-2311"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Jiacheng Liu","raw_affiliation_strings":["The Chinese University of Hong Kong, Hong Kong, China"],"raw_orcid":"https://orcid.org/0000-0003-0378-2311","affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Hong Kong, China","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128767323","display_name":"Xiaofeng Hou","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaofeng Hou","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-4372-7851","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116969385","display_name":"Yifei Pu","orcid":"https://orcid.org/0009-0000-7220-1643"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yifei Pu","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0000-7220-1643","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128783306","display_name":"Jing Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Wang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-7260-0521","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128758516","display_name":"Pheng-Ann Heng","orcid":null},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Pheng-Ann Heng","raw_affiliation_strings":["The Chinese University of Hong Kong, Hong Kong, China"],"raw_orcid":"https://orcid.org/0000-0003-3055-5034","affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Hong Kong, China","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128699231","display_name":"Chao Li","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Li","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0001-6218-4659","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":null,"display_name":"Minyi Guo","orcid":"https://orcid.org/0000-0003-0034-2302"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minyi Guo","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-0034-2302","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5128719323"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.40245875,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1185","last_page":"1200"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.15600000321865082,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.15600000321865082,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.1298000067472458,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.11330000311136246,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6870999932289124},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.667900025844574},{"id":"https://openalex.org/keywords/expert-system","display_name":"Expert system","score":0.6434999704360962},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6136999726295471},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5248000025749207},{"id":"https://openalex.org/keywords/inference-engine","display_name":"Inference engine","score":0.43560001254081726},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.42809998989105225},{"id":"https://openalex.org/keywords/subject-matter-expert","display_name":"Subject-matter expert","score":0.40119999647140503}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7781999707221985},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6870999932289124},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.667900025844574},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.6434999704360962},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6136999726295471},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5248000025749207},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.43560001254081726},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.42809998989105225},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.41519999504089355},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4068000018596649},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.40119999647140503},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.38830000162124634},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.35440000891685486},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.3400999903678894},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.3203999996185303},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.3098999857902527},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.30880001187324524},{"id":"https://openalex.org/C2778100165","wikidata":"https://www.wikidata.org/wiki/Q1589327","display_name":"Memory hierarchy","level":3,"score":0.30149999260902405},{"id":"https://openalex.org/C149091818","wikidata":"https://www.wikidata.org/wiki/Q2429814","display_name":"Software system","level":3,"score":0.28859999775886536},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.28610000014305115},{"id":"https://openalex.org/C17458331","wikidata":"https://www.wikidata.org/wiki/Q935672","display_name":"Spawn (biology)","level":2,"score":0.2646999955177307},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.263700008392334},{"id":"https://openalex.org/C52970973","wikidata":"https://www.wikidata.org/wiki/Q2497134","display_name":"Adaptive system","level":2,"score":0.25999999046325684},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2596000134944916},{"id":"https://openalex.org/C186108316","wikidata":"https://www.wikidata.org/wiki/Q352530","display_name":"Adaptive neuro fuzzy inference system","level":4,"score":0.25839999318122864},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.257999986410141},{"id":"https://openalex.org/C86610423","wikidata":"https://www.wikidata.org/wiki/Q1925081","display_name":"Metamodeling","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C102600418","wikidata":"https://www.wikidata.org/wiki/Q6517507","display_name":"Legal expert system","level":3,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3779212.3790187","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790187","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3779212.3790187","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790187","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure","score":0.5087746381759644}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W2092845679","https://openalex.org/W2150884987","https://openalex.org/W3205803342","https://openalex.org/W4387321091","https://openalex.org/W4403006781"],"related_works":[],"abstract_inverted_index":{"Mixture-of-experts":[0],"(MoE)":[1],"architectures":[2,53],"enable":[3,115],"scalable":[4],"Large":[5],"Language":[6],"Models":[7],"(LLMs)":[8],"with":[9,69],"reduced":[10],"computational":[11],"overhead,":[12],"yet":[13],"their":[14],"deployment":[15,155],"on":[16],"memory-constrained":[17],"edge":[18,52,146],"devices":[19],"is":[20,62],"hindered":[21],"by":[22,54],"substantial":[23],"memory":[24,30],"demands.":[25],"Traditional":[26],"expert-offloading":[27],"techniques":[28,82],"mitigate":[29],"constraints":[31],"but":[32],"often":[33],"significantly":[34],"increase":[35],"inference":[36,50,124],"latency.":[37],"We":[38],"introduce":[39],"MoE-APEX,":[40],"an":[41],"Adaptive":[42],"Precision":[43],"EXpert":[44],"offloading":[45,142],"system":[46],"that":[47,83],"optimizes":[48],"MoE":[49,89,141,154],"for":[51,152],"dynamically":[55],"managing":[56],"expert":[57,95,102,110,123],"precision.":[58],"Our":[59],"core":[60],"innovation":[61],"to":[63,117,136,139],"replace":[64],"less":[65],"critical":[66],"cache-miss":[67],"experts":[68],"low-precision":[70],"variants,":[71],"reducing":[72],"loading":[73,96],"latency":[74],"while":[75],"maintaining":[76],"accuracy.":[77],"MoE-APEX":[78,116,129],"introduces":[79],"three":[80],"innovative":[81],"map":[84],"the":[85,119],"natural":[86],"hierarchy":[87],"of":[88,121],"computation:":[90],"(1)":[91],"a":[92,99,107,149],"token-level":[93],"dynamic":[94],"mechanism,":[97],"(2)":[98],"layer-level":[100],"adaptive":[101],"prefetching":[103],"technique,":[104],"and":[105],"(3)":[106],"sequence-level":[108],"cost-aware":[109],"caching":[111],"policy.":[112],"These":[113],"innovations":[114],"leverage":[118],"benefits":[120],"mixed-precision":[122],"fully.":[125],"Implemented":[126],"atop":[127],"Llama.cpp,":[128],"achieves":[130],"decoding":[131],"speedups":[132],"ranging":[133],"from":[134],"1.34x":[135],"9.75x":[137],"compared":[138],"state-of-the-art":[140],"systems":[143],"across":[144],"diverse":[145],"devices,":[147],"offering":[148],"robust":[150],"solution":[151],"efficient":[153],"in":[156],"resource-constrained":[157],"environments.":[158]},"counts_by_year":[],"updated_date":"2026-03-12T06:18:43.230356","created_date":"2026-03-12T00:00:00"}
