{"id":"https://openalex.org/W7161018561","doi":"https://doi.org/10.48550/arxiv.2605.11277","title":"Sieve: Dynamic Expert-Aware PIM Acceleration for Evolving Mixture-of-Experts Models","display_name":"Sieve: Dynamic Expert-Aware PIM Acceleration for Evolving Mixture-of-Experts Models","publication_year":2026,"publication_date":"2026-05-11","ids":{"openalex":"https://openalex.org/W7161018561","doi":"https://doi.org/10.48550/arxiv.2605.11277"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.11277","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11277","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.11277","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136017838","display_name":"Jungwoo Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Jungwoo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136084937","display_name":"Rubens Lacouture","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lacouture, Rubens","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136035482","display_name":"Genghan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Genghan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085031335","display_name":"Gina Sohn","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sohn, Gina","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103250112","display_name":"Qizheng Zhang","orcid":"https://orcid.org/0009-0009-3208-4601"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qizheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136038507","display_name":"Swapnil Gandhi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gandhi, Swapnil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136041678","display_name":"Christos Kozyrakis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kozyrakis, Christos","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5023857198","display_name":"Kunle Olukotun","orcid":"https://orcid.org/0000-0002-8779-0636"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Olukotun, Kunle","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.15649999678134918,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.15649999678134918,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.0957999974489212,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.062199998646974564,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5130000114440918},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4862000048160553},{"id":"https://openalex.org/keywords/interactivity","display_name":"Interactivity","score":0.42829999327659607},{"id":"https://openalex.org/keywords/concurrency","display_name":"Concurrency","score":0.41839998960494995},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.37209999561309814},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.34369999170303345},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.3294000029563904},{"id":"https://openalex.org/keywords/expert-system","display_name":"Expert system","score":0.301800012588501}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8284000158309937},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5130000114440918},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4862000048160553},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4706999957561493},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4512999951839447},{"id":"https://openalex.org/C144430266","wikidata":"https://www.wikidata.org/wiki/Q839721","display_name":"Interactivity","level":2,"score":0.42829999327659607},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.41839998960494995},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.37209999561309814},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.34369999170303345},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.335999995470047},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.3294000029563904},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3095000088214874},{"id":"https://openalex.org/C58328972","wikidata":"https://www.wikidata.org/wiki/Q184609","display_name":"Expert system","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C2780870223","wikidata":"https://www.wikidata.org/wiki/Q1004415","display_name":"Runtime system","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.29420000314712524},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2838999927043915},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.27549999952316284},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.2718999981880188},{"id":"https://openalex.org/C123745756","wikidata":"https://www.wikidata.org/wiki/Q1665949","display_name":"Interconnection","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C98025372","wikidata":"https://www.wikidata.org/wiki/Q477538","display_name":"Systems architecture","level":3,"score":0.2655999958515167},{"id":"https://openalex.org/C206952183","wikidata":"https://www.wikidata.org/wiki/Q1193100","display_name":"Preemption","level":2,"score":0.25929999351501465}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.11277","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11277","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.11277","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.11277","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Mixture-of-Experts":[0],"(MoE)":[1],"has":[2],"become":[3],"a":[4,72,76,85,94,98,139,182],"dominant":[5],"architecture":[6],"for":[7,36,50,131,141,234],"scaling":[8],"large":[9],"language":[10],"models":[11,63,103,144],"(LLMs).":[12],"However,":[13],"the":[14,26,51,110,125,187],"execution":[15,156,191],"characteristics":[16],"of":[17,54,68,79,88,127],"MoE":[18,62,102,143],"inference":[19],"are":[20],"changing":[21],"rapidly":[22],"and":[23,57,118,159,175,194,205,207,240,245,250],"increasingly":[24,69,105],"mismatch":[25],"assumptions":[27],"underlying":[28],"existing":[29],"Processing-in-Memory":[30],"(PIM)":[31],"systems.":[32],"Prior":[33],"PIM":[34,129,160,176,203,232],"systems":[35,130,147,233],"LLMs":[37],"rely":[38],"on":[39,145,162,221,226,247],"static":[40],"rules":[41],"to":[42,46,189,230],"offload":[43],"memory-bound":[44],"operations":[45],"PIM,":[47],"without":[48],"accounting":[49],"combined":[52],"effects":[53],"load":[55],"imbalance":[56],"inter-GPU":[58],"communication.":[59],"Meanwhile,":[60],"modern":[61,101],"activate":[64],"fewer":[65],"experts":[66,80,89],"out":[67],"many,":[70],"creating":[71],"bimodal":[73,106],"expert":[74,155,216],"distribution:":[75],"small":[77],"set":[78],"receives":[81,90],"many":[82],"tokens,":[83],"while":[84,166,210],"long":[86],"tail":[87],"only":[91],"one":[92],"or":[93],"few.":[95],"We":[96],"identify":[97],"trend":[99],"in":[100,113],"toward":[104],"token-to-expert":[107,164],"distributions,":[108,165],"quantify":[109],"resulting":[111],"disparity":[112,122],"arithmetic":[114],"intensity":[115],"across":[116,192],"experts,":[117],"show":[119],"that":[120,185],"this":[121,135],"dramatically":[123],"reduces":[124],"efficiency":[126],"state-of-the-art":[128,231],"LLMs.":[132],"To":[133],"address":[134],"problem,":[136],"we":[137,179],"propose":[138,180],"scheduler":[140,153,188],"serving":[142],"multi-GPU":[146],"with":[148],"attached":[149,196],"HBM-PIM":[150,197],"stacks.":[151,198],"Our":[152],"partitions":[154],"between":[157],"GPU":[158,173,201],"based":[161,225],"runtime":[163,183],"jointly":[167],"considering":[168],"interconnect":[169],"overhead,":[170],"memory":[171],"bandwidth,":[172],"throughput,":[174],"throughput.":[177],"Moreover,":[178],"Sieve,":[181],"framework":[184],"employs":[186],"coordinate":[190],"GPUs":[193],"their":[195],"Sieve":[199,218,236],"overlaps":[200],"computation,":[202,204],"intra-":[206],"inter-device":[208],"communication":[209],"preserving":[211],"cross-device":[212],"dependencies":[213],"induced":[214],"by":[215,242],"parallelism.":[217],"is":[219],"evaluated":[220],"our":[222],"cycle-accurate":[223],"simulator":[224],"Ramulator":[227],"2.0.":[228],"Compared":[229],"MoE,":[235],"improves":[237],"both":[238],"throughput":[239],"interactivity":[241],"1.3x,":[243,244],"1.6x":[246],"Qwen3.5-397B-A17B,":[248],"GPT-OSS-120B,":[249],"Qwen3-30B-A3B,":[251],"respectively.":[252]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-14T00:00:00"}
