{"id":"https://openalex.org/W7129510454","doi":"https://doi.org/10.1109/access.2026.3665697","title":"Two-Stage Expert Offloading for Domain-Aware MoE Inference","display_name":"Two-Stage Expert Offloading for Domain-Aware MoE Inference","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7129510454","doi":"https://doi.org/10.1109/access.2026.3665697"},"language":"en","primary_location":{"id":"doi:10.1109/access.2026.3665697","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3665697","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2026.3665697","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Hangyeol Kim","orcid":"https://orcid.org/0009-0001-6416-2741"},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hangyeol Kim","raw_affiliation_strings":["Department of Electrical Computer Engineering, Sungkyunkwan University, Suwon, South Korea"],"raw_orcid":"https://orcid.org/0009-0001-6416-2741","affiliations":[{"raw_affiliation_string":"Department of Electrical Computer Engineering, Sungkyunkwan University, Suwon, South Korea","institution_ids":["https://openalex.org/I848706"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101323096","display_name":"Honguk Woo","orcid":null},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Honguk Woo","raw_affiliation_strings":["Department of Computer Science and Engineering, Sungkyunkwan University, Suwon, South Korea"],"raw_orcid":"https://orcid.org/0000-0001-6948-3440","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, Sungkyunkwan University, Suwon, South Korea","institution_ids":["https://openalex.org/I848706"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5013775136","display_name":"Younghwan Kim","orcid":null},"institutions":[{"id":"https://openalex.org/I4210131650","display_name":"Korea Electronics Technology Institute","ror":"https://ror.org/039k6f508","country_code":"KR","type":"facility","lineage":["https://openalex.org/I2801339556","https://openalex.org/I4210089395","https://openalex.org/I4210131650"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Younghwan Kim","raw_affiliation_strings":["Intelligent IDC Project Office, Korea Electronics Technology Institute, Seongnam-si, Gyeonggi-do, South Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Intelligent IDC Project Office, Korea Electronics Technology Institute, Seongnam-si, Gyeonggi-do, South Korea","institution_ids":["https://openalex.org/I4210131650"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.20415894,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"14","issue":null,"first_page":"33610","last_page":"33624"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.15049999952316284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.15049999952316284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.10270000249147415,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.07829999923706055,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.788100004196167},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.746399998664856},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5785999894142151},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.559499979019165},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.484499990940094},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.4657000005245209},{"id":"https://openalex.org/keywords/subject-matter-expert","display_name":"Subject-matter expert","score":0.43560001254081726},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.37310001254081726}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8863000273704529},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.788100004196167},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.746399998664856},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5785999894142151},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.559499979019165},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.484499990940094},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.4657000005245209},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.43560001254081726},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.37310001254081726},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34950000047683716},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.3384999930858612},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.33550000190734863},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.32919999957084656},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.3230000138282776},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.31150001287460327},{"id":"https://openalex.org/C207685749","wikidata":"https://www.wikidata.org/wiki/Q2088941","display_name":"Domain knowledge","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.288100004196167},{"id":"https://openalex.org/C2994168587","wikidata":"https://www.wikidata.org/wiki/Q5295","display_name":"Random access memory","level":2,"score":0.27250000834465027},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.25380000472068787},{"id":"https://openalex.org/C147297375","wikidata":"https://www.wikidata.org/wiki/Q6674930","display_name":"Look-ahead","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/access.2026.3665697","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3665697","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:fe06fb5767fb4828837fc3d35479b22e","is_oa":true,"landing_page_url":"https://doaj.org/article/fe06fb5767fb4828837fc3d35479b22e","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE Access, Vol 14, Pp 33610-33624 (2026)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1109/access.2026.3665697","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3665697","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2423750387","display_name":null,"funder_award_id":"RS-2023-00213118","funder_id":"https://openalex.org/F4320322120","funder_display_name":"National Research Foundation of Korea"}],"funders":[{"id":"https://openalex.org/F4320322120","display_name":"National Research Foundation of Korea","ror":"https://ror.org/013aysd81"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W2150884987","https://openalex.org/W2962739339","https://openalex.org/W3129831491","https://openalex.org/W4253028564","https://openalex.org/W4408017080"],"related_works":[],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"architectures,":[2],"while":[3,137],"computationally":[4],"efficient,":[5],"suffer":[6],"from":[7],"a":[8,48,64,92,130,141,160,168,191],"critical":[9],"memory\u2013latency":[10],"bottleneck.":[11],"The":[12],"need":[13],"to":[14,22,30,51,78,167],"store":[15],"all":[16],"experts":[17,100],"in":[18,118,123,133,154,163],"GPU":[19,60,103],"memory":[20,86],"or":[21],"incur":[23],"high":[24],"I/O":[25,107,134],"latency":[26],"when":[27],"offloading":[28,146],"them":[29,190],"CPU":[31],"severely":[32],"hinders":[33],"their":[34],"practical":[35,181],"deployment.":[36],"To":[37],"address":[38],"this,":[39],"we":[40],"propose":[41],"ADEPT":[42,62,113],"(Adaptive":[43],"Domain-aware":[44],"Expert":[45],"Prefetching":[46],"Technique),":[47],"framework":[49],"designed":[50],"optimize":[52],"expert":[53],"management":[54],"during":[55,68,89],"LLM":[56],"inference":[57],"under":[58],"memory-constrained":[59],"environments.":[61],"employs":[63],"two-stage":[65],"strategy.":[66],"First,":[67],"the":[69,74,102,180],"prefill":[70],"phase,":[71],"it":[72,158],"analyzes":[73],"input\u2019s":[75],"semantic":[76],"domain":[77],"selectively":[79],"preload":[80],"only":[81],"relevant":[82],"experts,":[83],"minimizing":[84],"peak":[85,164],"usage.":[87],"Second,":[88],"token-by-token":[90],"decoding,":[91],"locality-aware":[93],"mechanism":[94],"predicts":[95],"and":[96,152],"preemptively":[97],"loads":[98],"upcoming":[99],"into":[101],"cache,":[104],"effectively":[105],"hiding":[106],"latency.":[108],"Evaluated":[109],"across":[110],"diverse":[111],"domains,":[112],"demonstrates":[114],"significant":[115],"efficiency":[116],"gains":[117],"two":[119],"key":[120],"aspects:":[121],"(1)":[122],"terms":[124,155],"of":[125,156,183],"latency,":[126],"simulation":[127],"results":[128],"show":[129],"50%":[131],"reduction":[132],"wait":[135],"time,":[136],"real-world":[138],"validation":[139],"confirms":[140],"3.45\u00d7":[142],"speedup":[143],"over":[144],"standard":[145],"baselines":[147],"by":[148],"mitigating":[149],"system":[150],"overheads":[151],"(2)":[153],"memory,":[157],"yields":[159],"33%":[161],"decrease":[162],"usage":[165],"compared":[166],"full-loading":[169],"approach.":[170],"Our":[171],"findings":[172],"demonstrate":[173],"that":[174],"this":[175],"principled":[176],"approach":[177],"significantly":[178],"improves":[179],"deployability":[182],"MoE":[184],"models":[185],"on":[186],"cost-effective":[187],"hardware,":[188],"making":[189],"more":[192],"viable":[193],"option":[194],"for":[195],"latency-sensitive":[196],"applications.":[197]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-18T00:00:00"}
