{"id":"https://openalex.org/W4417034317","doi":"https://doi.org/10.48550/arxiv.2512.03927","title":"OD-MoE: On-Demand Expert Loading for Cacheless Edge-Distributed MoE Inference","display_name":"OD-MoE: On-Demand Expert Loading for Cacheless Edge-Distributed MoE Inference","publication_year":2025,"publication_date":"2025-12-03","ids":{"openalex":"https://openalex.org/W4417034317","doi":"https://doi.org/10.48550/arxiv.2512.03927"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2512.03927","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.03927","pdf_url":"https://arxiv.org/pdf/2512.03927","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2512.03927","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085737501","display_name":"Liujianfu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Liujianfu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023258501","display_name":"Yiping P. Du","orcid":"https://orcid.org/0000-0002-8326-7901"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Yuyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100720364","display_name":"Yuchen Pan","orcid":"https://orcid.org/0000-0001-8395-1271"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Yuchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019164720","display_name":"Soung Chang Liew","orcid":"https://orcid.org/0000-0001-7055-6483"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liew, Soung Chang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100656308","display_name":"Jiacheng Liu","orcid":"https://orcid.org/0009-0003-7010-4091"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiacheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100627368","display_name":"Kexin Chen","orcid":"https://orcid.org/0000-0003-4248-7681"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Kexin","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5085737501"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.3240000009536743,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.3240000009536743,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.17440000176429749,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.09640000015497208,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/expert-system","display_name":"Expert system","score":0.7378000020980835},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7107999920845032},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.6784999966621399},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6460000276565552},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.5859000086784363},{"id":"https://openalex.org/keywords/inference-engine","display_name":"Inference engine","score":0.5655999779701233},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5546000003814697},{"id":"https://openalex.org/keywords/edge-device","display_name":"Edge device","score":0.5422000288963318}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8562999963760376},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.7378000020980835},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7107999920845032},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.6784999966621399},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6460000276565552},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.5859000086784363},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.5655999779701233},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5546000003814697},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.5422000288963318},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.5385000109672546},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4223000109195709},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.41990000009536743},{"id":"https://openalex.org/C2778456923","wikidata":"https://www.wikidata.org/wiki/Q5337692","display_name":"Edge computing","level":3,"score":0.4131999909877777},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3553999960422516},{"id":"https://openalex.org/C102600418","wikidata":"https://www.wikidata.org/wiki/Q6517507","display_name":"Legal expert system","level":3,"score":0.3303000032901764},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.30250000953674316},{"id":"https://openalex.org/C157170001","wikidata":"https://www.wikidata.org/wiki/Q4781507","display_name":"Applications of artificial intelligence","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.28999999165534973},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.28380000591278076},{"id":"https://openalex.org/C2777472644","wikidata":"https://www.wikidata.org/wiki/Q16968992","display_name":"Approximate inference","level":3,"score":0.2809000015258789},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.2718000113964081},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.26440000534057617},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2563000023365021},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2524999976158142}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2512.03927","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.03927","pdf_url":"https://arxiv.org/pdf/2512.03927","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2512.03927","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.03927","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2512.03927","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.03927","pdf_url":"https://arxiv.org/pdf/2512.03927","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4417034317.pdf","grobid_xml":"https://content.openalex.org/works/W4417034317.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE),":[1],"while":[2,126,207],"offering":[3],"significant":[4],"advantages":[5],"as":[6],"a":[7,40,78,171,202],"Large":[8],"Language":[9],"Model":[10],"(LLM)":[11],"architecture,":[12],"faces":[13],"substantial":[14],"challenges":[15],"when":[16],"deployed":[17],"on":[18,170,228],"low-cost":[19,243],"edge":[20,111,229,248],"devices":[21,245],"with":[22,71,231],"tight":[23],"memory":[24,37,53,63,157],"constraints.":[25],"Expert":[26],"offloading":[27,168],"mitigates":[28],"this":[29,49],"issue":[30],"by":[31,55,217],"storing":[32],"expert":[33,66,88,93,104,107,121,127,139,182,222],"parameters":[34],"in":[35,45,249],"CPU":[36],"and":[38,106,113,150,191],"caching":[39,56,67],"subset":[41],"of":[42,142,197,201,211,242],"popular":[43],"experts":[44],"GPU":[46,52,62,156,213,233],"memory.":[47,214],"Although":[48],"approach":[50],"improves":[51],"utilization":[54],"only":[57,209],"the":[58,61,85,143,198,212,219,236,247,250],"likely-used":[59],"experts,":[60],"reserved":[64],"for":[65,87,158,221,238],"is":[68,96,129],"underutilized":[69],"compared":[70],"dense":[72],"LLMs.":[73],"This":[74],"paper":[75],"presents":[76],"OD-MoE,":[77],"distributed":[79,110,144],"MoE":[80,167,205,226,240],"inference":[81,227],"framework":[82],"that":[83,119],"obviates":[84],"need":[86,220],"caches":[89],"via":[90],"fully":[91,203],"on-demand":[92],"loading.":[94],"OD-MoE":[95,134,164,179,193,224],"built":[97],"upon":[98],"two":[99],"key":[100],"mechanisms:":[101],"1)":[102,178],"parallelizing":[103],"loading":[105],"computation":[108,128],"across":[109],"nodes,":[112],"2)":[114,192],"an":[115],"ultra-accurate":[116],"emulative":[117],"predictor":[118],"forecasts":[120],"activations":[122],"multiple":[123],"layers":[124],"ahead":[125],"ongoing.":[130],"With":[131],"these":[132],"innovations,":[133],"dynamically":[135],"loads":[136],"each":[137],"target":[138],"to":[140],"one":[141],"nodes":[145,230],"just-in-time":[146],"before":[147],"its":[148],"activation":[149,183],"promptly":[151],"evicts":[152],"it":[153],"afterward,":[154],"freeing":[155],"subsequent":[159],"experts.":[160],"We":[161],"comprehensively":[162],"benchmark":[163],"against":[165],"state-of-the-art":[166],"systems":[169],"ten-node":[172],"testbed.":[173],"Experimental":[174],"results":[175],"show":[176],"that:":[177],"achieves":[180],"99.94%":[181],"prediction":[184],"accuracy,":[185],"substantially":[186],"surpassing":[187],"all":[188],"existing":[189],"methods;":[190],"delivers":[194],"approximately":[195],"75%":[196],"decoding":[199],"speed":[200],"GPU-cached":[204],"deployment":[206,241],"using":[208],"1/3":[210],"More":[215],"importantly,":[216],"eliminating":[218],"caches,":[223],"enables":[225],"less-than-1GB":[232],"memory,":[234],"paving":[235],"way":[237],"practical":[239],"IoT":[244],"at":[246],"LLM":[251],"era.":[252]},"counts_by_year":[],"updated_date":"2026-03-13T16:22:10.518609","created_date":"2025-12-05T00:00:00"}
