{"id":"https://openalex.org/W4416429769","doi":"https://doi.org/10.1109/iccad66269.2025.11240805","title":"Tasa: Thermal-aware 3D-Stacked Architecture Design with Bandwidth Sharing for LLM Inference","display_name":"Tasa: Thermal-aware 3D-Stacked Architecture Design with Bandwidth Sharing for LLM Inference","publication_year":2025,"publication_date":"2025-10-26","ids":{"openalex":"https://openalex.org/W4416429769","doi":"https://doi.org/10.1109/iccad66269.2025.11240805"},"language":null,"primary_location":{"id":"doi:10.1109/iccad66269.2025.11240805","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240805","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101930482","display_name":"Siyuan He","orcid":"https://orcid.org/0000-0001-8992-9550"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Siyuan He","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110267455","display_name":"Peiran Yan","orcid":null},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peiran Yan","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081603207","display_name":"Yandong He","orcid":"https://orcid.org/0000-0003-3465-4718"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yandong He","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051409603","display_name":"Youwei Zhuo","orcid":"https://orcid.org/0000-0002-1557-2613"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Youwei Zhuo","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088551028","display_name":"Tianyu Jia","orcid":"https://orcid.org/0000-0002-4570-4613"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianyu Jia","raw_affiliation_strings":["Peking University,School of Integrated Circuits,Beijing,China"],"affiliations":[{"raw_affiliation_string":"Peking University,School of Integrated Circuits,Beijing,China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101930482"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.41389568,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.29989999532699585,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.29989999532699585,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.21860000491142273,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.04569999873638153,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6586999893188477},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5303999781608582},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4772000014781952},{"id":"https://openalex.org/keywords/dram","display_name":"Dram","score":0.47519999742507935},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.45899999141693115},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.45489999651908875},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.45249998569488525},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.4352000057697296}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7200999855995178},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6586999893188477},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5303999781608582},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4772000014781952},{"id":"https://openalex.org/C7366592","wikidata":"https://www.wikidata.org/wiki/Q1255620","display_name":"Dram","level":2,"score":0.47519999742507935},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.45899999141693115},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.45489999651908875},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.45249998569488525},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.446399986743927},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.4352000057697296},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.43479999899864197},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4341999888420105},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4041999876499176},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.39010000228881836},{"id":"https://openalex.org/C204530211","wikidata":"https://www.wikidata.org/wiki/Q752823","display_name":"Thermal","level":2,"score":0.32659998536109924},{"id":"https://openalex.org/C3020431745","wikidata":"https://www.wikidata.org/wiki/Q25325220","display_name":"Many core","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.2980000078678131},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.28769999742507935},{"id":"https://openalex.org/C2779602883","wikidata":"https://www.wikidata.org/wiki/Q15544750","display_name":"Memory architecture","level":2,"score":0.27630001306533813},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.27619999647140503},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.26249998807907104},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.25679999589920044},{"id":"https://openalex.org/C178693496","wikidata":"https://www.wikidata.org/wiki/Q911691","display_name":"Clock rate","level":3,"score":0.2551000118255615},{"id":"https://openalex.org/C2984335091","wikidata":"https://www.wikidata.org/wiki/Q11388","display_name":"Thermal infrared","level":3,"score":0.2547999918460846},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccad66269.2025.11240805","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240805","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W2048935416","https://openalex.org/W2346205343","https://openalex.org/W2605347906","https://openalex.org/W2884361556","https://openalex.org/W2940862705","https://openalex.org/W2956705340","https://openalex.org/W2998732502","https://openalex.org/W3093933627","https://openalex.org/W3134121265","https://openalex.org/W3136346557","https://openalex.org/W3189166979","https://openalex.org/W4200550987","https://openalex.org/W4220702013","https://openalex.org/W4220972538","https://openalex.org/W4221001402","https://openalex.org/W4226126604","https://openalex.org/W4285103020","https://openalex.org/W4385192563","https://openalex.org/W4385245566","https://openalex.org/W4387064011","https://openalex.org/W4387869276","https://openalex.org/W4388757726","https://openalex.org/W4392427708","https://openalex.org/W4394998968","https://openalex.org/W4395106409","https://openalex.org/W4399389540","https://openalex.org/W4401211878","https://openalex.org/W4401367577","https://openalex.org/W4402715152","https://openalex.org/W4404955001"],"related_works":[],"abstract_inverted_index":{"The":[0],"autoregressive":[1],"decoding":[2],"in":[3,57,126],"LLMs":[4],"is":[5,21,98,106],"the":[6,12,46,68,85,90,93,123,143,184],"major":[7],"inference":[8],"bottleneck":[9],"due":[10],"to":[11,54,83,121,149],"memory-intensive":[13,109],"operations":[14],"and":[15,63,88,154,163,171,178,187],"limited":[16],"hardware":[17],"bandwidth.":[18],"3D-stacked":[19,47,71,145],"architecture":[20,48,78,137],"a":[22,76,117],"promising":[23],"solution":[24],"with":[25,79,142],"significantly":[26],"improved":[27],"memory":[28],"bandwidth,":[29],"which":[30],"vertically":[31],"stacked":[32],"multi":[33],"DRAM":[34],"dies":[35],"on":[36],"top":[37],"of":[38,59,70],"logic":[39],"die.":[40],"However,":[41],"our":[42,135],"experiments":[43,132],"also":[44,175],"show":[45,133],"faces":[49],"severer":[50],"thermal":[51,60,81,94,131],"issues":[52],"compared":[53,141],"2D":[55],"architecture,":[56,72,146],"terms":[58],"temperature,":[61],"gradient":[62],"scalability.":[64],"To":[65],"better":[66],"exploit":[67],"potential":[69],"we":[73,115],"present":[74],"Tasa,":[75],"heterogeneous":[77,128,189],"cross-stack":[80],"optimizations":[82],"balance":[84],"temperature":[86,158],"distribution":[87],"maximize":[89],"performance":[91],"under":[92],"constraints.":[95],"High-performance":[96],"core":[97,105,165],"designed":[99],"for":[100,108,160,169],"compute-intensive":[101],"operations,":[102],"while":[103],"high-efficiency":[104],"used":[107],"operators,":[110],"e.g.":[111],"attention":[112],"layers.":[113],"Furthermore,":[114],"propose":[116],"bandwidth":[118,124],"sharing":[119],"scheduling":[120],"improve":[122],"utilization":[125],"such":[127],"architecture.":[129],"Extensive":[130],"that":[134],"Tasa":[136],"demonstrates":[138],"greater":[139],"scalability":[140],"homogeneous":[144],"i.e.":[147],"up":[148],"5.55":[150],"\u00b0C,":[151,153],"9.37":[152],"7.91":[155],"\u00b0C":[156],"peak":[157],"reduction":[159],"48,":[161],"60,":[162],"72":[164],"configurations.":[166],"Our":[167],"experimental":[168],"Llama-65B":[170],"GPT-3":[172],"66B":[173],"inferences":[174],"demonstrate":[176],"2.85\u00d7":[177],"2.21\u00d7":[179],"speedup":[180],"are":[181],"obtained":[182],"over":[183],"GPU":[185],"baselines":[186],"state-of-the-art":[188],"PIM-based":[190],"LLM":[191],"accelerator.":[192]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-20T00:00:00"}
