{"id":"https://openalex.org/W7139919313","doi":"https://doi.org/10.48550/arxiv.2603.19172","title":"DyMoE: Dynamic Expert Orchestration with Mixed-Precision Quantization for Efficient MoE Inference on Edge","display_name":"DyMoE: Dynamic Expert Orchestration with Mixed-Precision Quantization for Efficient MoE Inference on Edge","publication_year":2026,"publication_date":"2026-03-19","ids":{"openalex":"https://openalex.org/W7139919313","doi":"https://doi.org/10.48550/arxiv.2603.19172"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.19172","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19172","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.19172","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130244514","display_name":"Yuegui Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yuegui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065602294","display_name":"Zhiyuan Fang","orcid":"https://orcid.org/0000-0003-1186-5221"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Zhiyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130227164","display_name":"Weiqi Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Weiqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130214970","display_name":"Ruoyu Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Ruoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130222937","display_name":"Wuhui Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Wuhui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130225441","display_name":"Zibin Zheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Zibin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.35260000824928284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.35260000824928284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.1396999955177307,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13553","display_name":"Age of Information Optimization","score":0.06159999966621399,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7671999931335449},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6888999938964844},{"id":"https://openalex.org/keywords/memory-footprint","display_name":"Memory footprint","score":0.5566999912261963},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.5253000259399414},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.5101000070571899},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.4997999966144562},{"id":"https://openalex.org/keywords/inference-engine","display_name":"Inference engine","score":0.4918000102043152},{"id":"https://openalex.org/keywords/edge-device","display_name":"Edge device","score":0.4505000114440918},{"id":"https://openalex.org/keywords/prioritization","display_name":"Prioritization","score":0.4171999990940094}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7728999853134155},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7671999931335449},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6888999938964844},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.5566999912261963},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.5253000259399414},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.5101000070571899},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.4997999966144562},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49320000410079956},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.4918000102043152},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.4505000114440918},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4404999911785126},{"id":"https://openalex.org/C2777615720","wikidata":"https://www.wikidata.org/wiki/Q11888847","display_name":"Prioritization","level":2,"score":0.4171999990940094},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.4025999903678894},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.3474000096321106},{"id":"https://openalex.org/C7366592","wikidata":"https://www.wikidata.org/wiki/Q1255620","display_name":"Dram","level":2,"score":0.3228999972343445},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.32030001282691956},{"id":"https://openalex.org/C199168358","wikidata":"https://www.wikidata.org/wiki/Q3367000","display_name":"Orchestration","level":3,"score":0.3124000132083893},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.31029999256134033},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.299699991941452},{"id":"https://openalex.org/C2778456923","wikidata":"https://www.wikidata.org/wiki/Q5337692","display_name":"Edge computing","level":3,"score":0.28610000014305115},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.2838999927043915},{"id":"https://openalex.org/C2780615836","wikidata":"https://www.wikidata.org/wiki/Q2471869","display_name":"USable","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.2743000090122223},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.27230000495910645},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.27079999446868896},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2676999866962433},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.2676999866962433},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.19172","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19172","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.19172","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.19172","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"the":[1,7],"computational":[2],"efficiency":[3],"of":[4],"MoE":[5,134],"models,":[6],"excessive":[8],"memory":[9],"footprint":[10],"and":[11,46,71,95,117],"I/O":[12,101],"overhead":[13],"inherent":[14],"in":[15,92,123],"multi-expert":[16],"architectures":[17],"pose":[18],"formidable":[19],"challenges":[20],"for":[21,61],"real-time":[22],"inference":[23,135],"on":[24,105,136],"resource-constrained":[25,137],"edge":[26,63,107,138],"platforms.":[27],"While":[28],"existing":[29],"static":[30],"methods":[31],"struggle":[32],"with":[33],"a":[34,55,120],"rigid":[35],"latency-accuracy":[36],"trade-off,":[37],"we":[38,52],"observe":[39],"that":[40,110],"expert":[41,68],"importance":[42,69],"is":[43],"highly":[44],"skewed":[45],"depth-dependent.":[47],"Motivated":[48],"by":[49,115],"these":[50],"insights,":[51],"propose":[53],"DyMoE,":[54],"dynamic":[56],"mixed-precision":[57],"quantization":[58],"framework":[59],"designed":[60],"high-performance":[62],"inference.":[64],"Leveraging":[65],"insights":[66],"into":[67],"skewness":[70],"depth-dependent":[72],"sensitivity,":[73],"DyMoE":[74,111],"introduces:":[75],"(1)":[76],"importance-aware":[77],"prioritization":[78],"to":[79,88,99,119,127],"dynamically":[80],"quantize":[81],"experts":[82],"at":[83],"runtime;":[84],"(2)":[85],"depth-adaptive":[86],"scheduling":[87],"preserve":[89],"semantic":[90],"integrity":[91],"critical":[93],"layers;":[94],"(3)":[96],"look-ahead":[97],"prefetching":[98],"overlap":[100],"stalls.":[102],"Experimental":[103],"results":[104],"commercial":[106],"hardware":[108],"show":[109],"reduces":[112],"Time-to-First-Token":[113],"(TTFT)":[114],"3.44x-22.7x":[116],"up":[118],"14.58x":[121],"speedup":[122],"Time-Per-Output-Token":[124],"(TPOT)":[125],"compared":[126],"state-of-the-art":[128],"offloading":[129],"baselines,":[130],"enabling":[131],"real-time,":[132],"accuracy-preserving":[133],"devices.":[139]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-21T00:00:00"}
