{"id":"https://openalex.org/W7151281843","doi":"https://doi.org/10.48550/arxiv.2604.04253","title":"Rethinking Compute Substrates for 3D-Stacked Near-Memory LLM Decoding: Microarchitecture-Scheduling Co-Design","display_name":"Rethinking Compute Substrates for 3D-Stacked Near-Memory LLM Decoding: Microarchitecture-Scheduling Co-Design","publication_year":2026,"publication_date":"2026-04-05","ids":{"openalex":"https://openalex.org/W7151281843","doi":"https://doi.org/10.48550/arxiv.2604.04253"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04253","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04253","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04253","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5097277847","display_name":"Chenyang Ai","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ai, Chenyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072774003","display_name":"Yuxin Zhang","orcid":"https://orcid.org/0009-0007-5232-4006"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yixing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133139695","display_name":"Haoran Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Haoran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133065218","display_name":"Yudong Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Yudong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133114052","display_name":"Lechuan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Lechuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133071591","display_name":"Wenhui OU","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"OU, Wenhui","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5097277847"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4580000042915344,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4580000042915344,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.11240000277757645,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.10559999942779541,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/systolic-array","display_name":"Systolic array","score":0.6818000078201294},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.6410999894142151},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5467000007629395},{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.5327000021934509},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.49079999327659607},{"id":"https://openalex.org/keywords/reconfigurability","display_name":"Reconfigurability","score":0.4684999883174896},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.46209999918937683},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.4553000032901764},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.44609999656677246}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.779699981212616},{"id":"https://openalex.org/C150741067","wikidata":"https://www.wikidata.org/wiki/Q2377218","display_name":"Systolic array","level":3,"score":0.6818000078201294},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.6410999894142151},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5467000007629395},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.5327000021934509},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5235000252723694},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.49079999327659607},{"id":"https://openalex.org/C2780149590","wikidata":"https://www.wikidata.org/wiki/Q7302742","display_name":"Reconfigurability","level":2,"score":0.4684999883174896},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.46209999918937683},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.4553000032901764},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.44609999656677246},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.43970000743865967},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.33799999952316284},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.32339999079704285},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3206999897956848},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.32010000944137573},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.31929999589920044},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3188999891281128},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.3089999854564667},{"id":"https://openalex.org/C165005293","wikidata":"https://www.wikidata.org/wiki/Q1074500","display_name":"Chip","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C2776350369","wikidata":"https://www.wikidata.org/wiki/Q843479","display_name":"Control logic","level":2,"score":0.30140000581741333},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C157922185","wikidata":"https://www.wikidata.org/wiki/Q173198","display_name":"Logic synthesis","level":3,"score":0.2833999991416931},{"id":"https://openalex.org/C107598950","wikidata":"https://www.wikidata.org/wiki/Q259864","display_name":"Microarchitecture","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.27790001034736633},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C2776221188","wikidata":"https://www.wikidata.org/wiki/Q21072556","display_name":"Design space exploration","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C152890283","wikidata":"https://www.wikidata.org/wiki/Q4129922","display_name":"Computing with Memory","level":5,"score":0.2667999863624573},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.25679999589920044}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04253","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04253","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04253","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04253","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.90059894323349,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"model":[2],"(LLM)":[3],"decoding":[4],"is":[5],"a":[6,38,83,106,196],"major":[7],"inference":[8],"bottleneck":[9],"because":[10],"its":[11],"low":[12,224],"arithmetic":[13],"intensity":[14],"makes":[15],"performance":[16],"highly":[17,197],"sensitive":[18],"to":[19,142,166,190,211,218],"memory":[20,30,150],"bandwidth.":[21],"3D-stacked":[22,58,94,212],"near-memory":[23],"processing":[24],"(NMP)":[25],"provides":[26,172],"substantially":[27],"higher":[28,251],"local":[29,149],"bandwidth":[31,50,151],"than":[32],"conventional":[33],"off-chip":[34],"interfaces,":[35],"making":[36,122],"it":[37],"promising":[39],"substrate":[40,79],"for":[41,132,155,182,185],"decode":[42,55,116],"acceleration.":[43],"However,":[44],"our":[45,242],"analysis":[46],"shows":[47],"that":[48,115],"this":[49,138],"advantage":[51],"also":[52],"shifts":[53],"many":[54],"operators":[56,117],"on":[57,137,201],"NMP":[59,95,213],"back":[60],"into":[61],"the":[62,66,71,74,77,89,147,153,160,175,192,206,220],"compute-bound":[63],"regime.":[64],"Under":[65],"tight":[67],"area":[68,225],"budget":[69],"of":[70,76,92,174,223],"logic":[72,177],"die,":[73],"design":[75,243],"compute":[78,90,103,208],"itself":[80],"therefore":[81],"becomes":[82],"first-order":[84],"challenge.":[85],"Therefore,":[86],"we":[87,98,112,140,204],"rethink":[88],"microarchitecture":[91,209],"prior":[93,100],"designs.":[96],"First,":[97],"replace":[99],"MAC":[101],"tree-based":[102],"units":[104],"with":[105,240],"more":[107],"area-efficient":[108,198],"systolic":[109,126,186],"array,":[110,187],"and":[111,129,159,178,229,249,257],"further":[113,233],"observe":[114],"exhibit":[118],"substantial":[119],"shape":[120,128],"diversity,":[121],"reconfigurability":[123],"in":[124,195],"both":[125,255],"array":[127],"dataflow":[130],"essential":[131],"sustaining":[133],"high":[134,148],"utilization.":[135],"Building":[136],"insight,":[139],"continue":[141],"exploit":[143],"two":[144,193],"key":[145],"opportunities:":[146],"reduces":[152],"need":[154],"large":[156],"on-chip":[157],"buffers,":[158],"existing":[161],"vector":[162],"core,":[163],"originally":[164],"designed":[165,217],"handle":[167],"auxiliary":[168],"tensor":[169],"computations,":[170],"already":[171],"much":[173],"control":[176],"multi-ported":[179],"buffering":[180],"required":[181],"fine-grained":[183,230],"flexibility":[184],"allowing":[188],"us":[189],"unify":[191],"structures":[194],"manner.":[199],"Based":[200],"these":[202],"insights,":[203],"present":[205],"first":[207],"tailored":[210],"LLM":[214],"decoding,":[215],"explicitly":[216],"satisfy":[219],"joint":[221],"requirements":[222],"cost,":[226],"high-bandwidth":[227],"operation,":[228],"reconfigurability.":[231],"We":[232],"propose":[234],"an":[235,245],"multi-core":[236],"scheduling":[237],"framework.":[238],"Compared":[239],"Stratum,":[241],"achieves":[244],"average":[246],"2.91x":[247],"speedup":[248],"2.40x":[250],"energy":[252],"efficiency":[253],"across":[254],"dense":[256],"MoE":[258],"models.":[259]},"counts_by_year":[],"updated_date":"2026-04-11T06:13:24.991567","created_date":"2026-04-08T00:00:00"}
