{"id":"https://openalex.org/W7140195344","doi":"https://doi.org/10.48550/arxiv.2603.21862","title":"Holistic Scaling Laws for Optimal Mixture-of-Experts Architecture Optimization","display_name":"Holistic Scaling Laws for Optimal Mixture-of-Experts Architecture Optimization","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140195344","doi":"https://doi.org/10.48550/arxiv.2603.21862"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21862","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21862","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21862","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wan, Weilin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Weilin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Han, Jingtao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Jingtao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Weizhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Weizhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Jin, Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.5616999864578247,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.5616999864578247,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.05350000038743019,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10848","display_name":"Advanced Multi-Objective Optimization Algorithms","score":0.026000000536441803,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/flops","display_name":"FLOPS","score":0.6132000088691711},{"id":"https://openalex.org/keywords/flexibility","display_name":"Flexibility (engineering)","score":0.5627999901771545},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5461000204086304},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5370000004768372},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5185999870300293},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.5054000020027161},{"id":"https://openalex.org/keywords/scaling-law","display_name":"Scaling law","score":0.4453999996185303},{"id":"https://openalex.org/keywords/minification","display_name":"Minification","score":0.4366999864578247},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.4327000081539154}],"concepts":[{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.6132000088691711},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.5627999901771545},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5461000204086304},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.5435000061988831},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5370000004768372},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5340999960899353},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5185999870300293},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.5054000020027161},{"id":"https://openalex.org/C2988430800","wikidata":"https://www.wikidata.org/wiki/Q428971","display_name":"Scaling law","level":3,"score":0.4453999996185303},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.4366999864578247},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.4327000081539154},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.4083000123500824},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3497999906539917},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.3465000092983246},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3443000018596649},{"id":"https://openalex.org/C2779585090","wikidata":"https://www.wikidata.org/wiki/Q3457762","display_name":"Resilience (materials science)","level":2,"score":0.33880001306533813},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.29420000314712524},{"id":"https://openalex.org/C55660270","wikidata":"https://www.wikidata.org/wiki/Q5164377","display_name":"Constrained optimization","level":2,"score":0.28780001401901245},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2874000072479248},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.28630000352859497},{"id":"https://openalex.org/C71559656","wikidata":"https://www.wikidata.org/wiki/Q671298","display_name":"Divide and conquer algorithms","level":2,"score":0.28529998660087585},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.28360000252723694},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.2766999900341034},{"id":"https://openalex.org/C89109886","wikidata":"https://www.wikidata.org/wiki/Q1535924","display_name":"Trust region","level":3,"score":0.2728999853134155},{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.262800008058548},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.25850000977516174},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.2547000050544739},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21862","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21862","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21862","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21862","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.6183607578277588,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Scaling":[0],"laws":[1,160],"for":[2,64,86],"Large":[3],"Language":[4],"Models":[5],"govern":[6],"macroscopic":[7],"resource":[8],"allocation,":[9],"yet":[10],"translating":[11],"them":[12],"into":[13],"precise":[14],"Mixture-of-Experts":[15],"(MoE)":[16],"architectural":[17,67,123],"configurations":[18],"remains":[19],"an":[20,82],"open":[21],"problem":[22],"due":[23],"to":[24,39,126,166,188],"the":[25,121,139,177],"combinatorially":[26],"vast":[27],"design":[28],"space.":[29],"Existing":[30],"MoE":[31,46,66,87,146,170],"scaling":[32,42,159,190],"studies":[33],"are":[34],"constrained":[35],"by":[36],"experimental":[37],"budgets":[38],"either":[40],"augment":[41],"formulas":[43],"with":[44,182],"extra":[45],"variables,":[47],"risking":[48],"unreliable":[49],"fits,":[50],"or":[51],"fix":[52],"all":[53],"non-MoE":[54],"factors,":[55],"ignoring":[56],"global":[57],"interactions.":[58],"We":[59,73,118],"propose":[60],"a":[61,105,135,167],"reusable":[62],"framework":[63,156],"holistic":[65],"optimization":[68],"that":[69,76,161,176],"bridges":[70],"this":[71],"gap.":[72],"first":[74],"show":[75],"FLOPs":[77,110],"per":[78,111],"token":[79],"alone":[80],"is":[81,175],"inadequate":[83],"fairness":[84],"metric":[85],"models":[88,147],"because":[89],"differing":[90],"computational":[91],"densities":[92],"across":[93,143],"layer":[94],"types":[95],"can":[96],"inflate":[97],"parameters":[98],"without":[99],"proportional":[100],"compute":[101,164],"cost,":[102],"and":[103,115,134],"establish":[104],"joint":[106],"constraint":[107],"triad":[108],"of":[109,138,145,151],"token,":[112],"active":[113],"parameters,":[114],"total":[116],"parameters.":[117],"then":[119],"reduce":[120],"16-dimensional":[122],"search":[124],"space":[125],"two":[127],"sequential":[128],"low-dimensional":[129],"phases":[130],"through":[131],"algebraic":[132],"constraints":[133],"rank-preserving":[136],"property":[137],"hidden":[140],"dimension.":[141],"Validated":[142],"hundreds":[144],"spanning":[148],"six":[149],"orders":[150],"magnitude":[152],"in":[153],"compute,":[154],"our":[155],"yields":[157],"robust":[158],"map":[162],"any":[163],"budget":[165],"complete,":[168],"optimal":[169],"architecture.":[171],"A":[172],"key":[173],"finding":[174],"near-optimal":[178],"configuration":[179],"band":[180],"widens":[181],"scale,":[183],"giving":[184],"practitioners":[185],"quantitative":[186],"flexibility":[187],"balance":[189],"law":[191],"recommendations":[192],"against":[193],"infrastructure":[194],"constraints.":[195]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-25T00:00:00"}
