{"id":"https://openalex.org/W4414197279","doi":"https://doi.org/10.1109/dac63849.2025.11132883","title":"3D-TokSIM: Stacking 3D Memory with Token-Stationary Compute-in-Memory for Speculative LLM Inference","display_name":"3D-TokSIM: Stacking 3D Memory with Token-Stationary Compute-in-Memory for Speculative LLM Inference","publication_year":2025,"publication_date":"2025-06-22","ids":{"openalex":"https://openalex.org/W4414197279","doi":"https://doi.org/10.1109/dac63849.2025.11132883"},"language":"en","primary_location":{"id":"doi:10.1109/dac63849.2025.11132883","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132883","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072235841","display_name":"Wentao Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wentao Zhao","raw_affiliation_strings":["Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085704421","display_name":"Bingfeng Lv","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Boya Lv","raw_affiliation_strings":["Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Meng Wu","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meng Wu","raw_affiliation_strings":["Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102753527","display_name":"Peiyu Chen","orcid":"https://orcid.org/0000-0003-3043-3125"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peiyu Chen","raw_affiliation_strings":["Nano Core Chip Electronic Technology,Hangzhou,China"],"affiliations":[{"raw_affiliation_string":"Nano Core Chip Electronic Technology,Hangzhou,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102362706","display_name":"Fengyun Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fengyun Yan","raw_affiliation_strings":["Nano Core Chip Electronic Technology,Hangzhou,China"],"affiliations":[{"raw_affiliation_string":"Nano Core Chip Electronic Technology,Hangzhou,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068840674","display_name":"Yufei Ma","orcid":"https://orcid.org/0000-0002-2670-524X"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yufei Ma","raw_affiliation_strings":["Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088551028","display_name":"Tianyu Jia","orcid":"https://orcid.org/0000-0002-4570-4613"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianyu Jia","raw_affiliation_strings":["Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062886480","display_name":"Ru Huang","orcid":"https://orcid.org/0000-0002-8146-4821"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ru Huang","raw_affiliation_strings":["Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003039083","display_name":"Le Ye","orcid":"https://orcid.org/0000-0003-0599-7762"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Le Ye","raw_affiliation_strings":["Peking University,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5072235841"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":0.7469,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.76997327,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10472","display_name":"Semiconductor materials and devices","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10472","display_name":"Semiconductor materials and devices","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12166","display_name":"Ion-surface interactions and analysis","score":0.9896000027656555,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9886999726295471,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.6359000205993652},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5598000288009644},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.4772999882698059},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4519999921321869},{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.42809998989105225},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4253000020980835},{"id":"https://openalex.org/keywords/memory-architecture","display_name":"Memory architecture","score":0.4242999851703644},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.41269999742507935},{"id":"https://openalex.org/keywords/semiconductor-memory","display_name":"Semiconductor memory","score":0.40689998865127563},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.40450000762939453}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7976999878883362},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.6359000205993652},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5598000288009644},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.4772999882698059},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4519999921321869},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.44830000400543213},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.42809998989105225},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4253000020980835},{"id":"https://openalex.org/C2779602883","wikidata":"https://www.wikidata.org/wiki/Q15544750","display_name":"Memory architecture","level":2,"score":0.4242999851703644},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.41269999742507935},{"id":"https://openalex.org/C98986596","wikidata":"https://www.wikidata.org/wiki/Q1143031","display_name":"Semiconductor memory","level":2,"score":0.40689998865127563},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.40450000762939453},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.36169999837875366},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.3571999967098236},{"id":"https://openalex.org/C131017901","wikidata":"https://www.wikidata.org/wiki/Q170451","display_name":"Logic gate","level":2,"score":0.3343000113964081},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.32409998774528503},{"id":"https://openalex.org/C157922185","wikidata":"https://www.wikidata.org/wiki/Q173198","display_name":"Logic synthesis","level":3,"score":0.3100999891757965},{"id":"https://openalex.org/C93446704","wikidata":"https://www.wikidata.org/wiki/Q449328","display_name":"Registered memory","level":3,"score":0.3098999857902527},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.3093000054359436},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30660000443458557},{"id":"https://openalex.org/C53838383","wikidata":"https://www.wikidata.org/wiki/Q541148","display_name":"Conventional memory","level":5,"score":0.3009999990463257},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.295199990272522},{"id":"https://openalex.org/C63511323","wikidata":"https://www.wikidata.org/wiki/Q908936","display_name":"Interleaved memory","level":4,"score":0.28940001130104065},{"id":"https://openalex.org/C2994168587","wikidata":"https://www.wikidata.org/wiki/Q5295","display_name":"Random access memory","level":2,"score":0.288100004196167},{"id":"https://openalex.org/C41036726","wikidata":"https://www.wikidata.org/wiki/Q844824","display_name":"Physical address","level":3,"score":0.28780001401901245},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.2874000072479248},{"id":"https://openalex.org/C33347731","wikidata":"https://www.wikidata.org/wiki/Q285210","display_name":"Stacking","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.2718000113964081},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.27000001072883606},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2671999931335449},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2630000114440918},{"id":"https://openalex.org/C153247305","wikidata":"https://www.wikidata.org/wiki/Q835713","display_name":"Memory address","level":3,"score":0.2630000114440918},{"id":"https://openalex.org/C47487241","wikidata":"https://www.wikidata.org/wiki/Q5227230","display_name":"Data access","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C193969084","wikidata":"https://www.wikidata.org/wiki/Q7452500","display_name":"Sequential decoding","level":4,"score":0.2599000036716461},{"id":"https://openalex.org/C18131444","wikidata":"https://www.wikidata.org/wiki/Q163585","display_name":"Memory protection","level":5,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac63849.2025.11132883","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132883","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W2057332538","https://openalex.org/W2791561716","https://openalex.org/W2976137532","https://openalex.org/W3136346557","https://openalex.org/W3213528054","https://openalex.org/W4220972538","https://openalex.org/W4221086307","https://openalex.org/W4385192563","https://openalex.org/W4386764151","https://openalex.org/W4387064011","https://openalex.org/W4393407316","https://openalex.org/W4401042997","https://openalex.org/W4401211878","https://openalex.org/W4402683901","https://openalex.org/W4404133606","https://openalex.org/W4404134065"],"related_works":[],"abstract_inverted_index":{"The":[0],"LLM":[1],"decoding":[2,101],"process":[3],"poses":[4],"a":[5,37,48],"significant":[6],"challenge":[7],"for":[8,98],"memory":[9,18,24,43,81],"bandwidth":[10],"due":[11,26],"to":[12,21,27,54,67,74,104],"its":[13],"autoregressive":[14],"nature.":[15],"Prior":[16],"2D":[17],"solutions":[19],"fail":[20],"overcome":[22],"this":[23,32],"bottleneck":[25],"limited":[28],"memory-to-logic":[29],"bandwidth.":[30],"In":[31],"work,":[33],"we":[34,89],"propose":[35],"3D-TokSIM,":[36],"cross-stack":[38],"solution":[39],"by":[40],"stacking":[41],"3D":[42],"on":[44,71,119],"logic":[45,72],"die":[46,73],"with":[47,63],"specially":[49],"designed":[50],"token-stationary":[51,65],"compute-in-memory":[52],"(CIM)":[53],"efficiently":[55],"accelerate":[56],"speculative":[57,120],"decoding.":[58,122],"Our":[59],"CIM":[60,96],"is":[61],"developed":[62],"novel":[64],"dataflow":[66],"reduce":[68,85],"data":[69],"movement":[70],"save":[75],"power":[76],"and":[77,80,93,113],"balance":[78],"computation":[79],"access.":[82],"To":[83],"further":[84],"the":[86],"buffer":[87],"requirements,":[88],"perform":[90],"architecture":[91],"exploration":[92],"allocate":[94],"notable":[95],"resources":[97],"achieving":[99],"higher":[100],"parallelism.":[102],"Compared":[103],"RTX":[105],"3090":[106],"GPU,":[107],"3D-TokSIM":[108],"achieves":[109],"15.1":[110],"$\\times$":[111],"throughput":[112],"$324":[114],"\\times$":[115],"energy":[116],"efficiency":[117],"improvements":[118],"Llama2-7B":[121]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2025-10-10T00:00:00"}
