{"id":"https://openalex.org/W7151595904","doi":"https://doi.org/10.48550/arxiv.2604.04750","title":"DeepStack: Scalable and Accurate Design Space Exploration for Distributed 3D-Stacked AI Accelerators","display_name":"DeepStack: Scalable and Accurate Design Space Exploration for Distributed 3D-Stacked AI Accelerators","publication_year":2026,"publication_date":"2026-04-06","ids":{"openalex":"https://openalex.org/W7151595904","doi":"https://doi.org/10.48550/arxiv.2604.04750"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04750","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04750","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04750","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125967418","display_name":"Zhiwen Mo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mo, Zhiwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100708853","display_name":"Guoyu Li","orcid":"https://orcid.org/0009-0008-5914-323X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Guoyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Hao Mark","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Hao Mark","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133126239","display_name":"Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Tang, Zhengju","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Zhengju","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Qianzhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qianzhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Lei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Lei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liang, Shuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Shuang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133124318","display_name":"Shuang Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Lingxiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133147374","display_name":"Lingxiao Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Xianqi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Guo, Yuxiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Yuxiao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Luk, Wayne","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luk, Wayne","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xue, Jilong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Jilong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Fan, Hongxiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Hongxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6625000238418579,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6625000238418579,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11527","display_name":"3D IC and TSV technologies","score":0.13420000672340393,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.0502999983727932,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6553000211715698},{"id":"https://openalex.org/keywords/design-space-exploration","display_name":"Design space exploration","score":0.6510000228881836},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5328999757766724},{"id":"https://openalex.org/keywords/dram","display_name":"Dram","score":0.5070000290870667},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.48899999260902405},{"id":"https://openalex.org/keywords/distributed-memory","display_name":"Distributed memory","score":0.43619999289512634},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.361299991607666},{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.36039999127388},{"id":"https://openalex.org/keywords/schedule","display_name":"Schedule","score":0.357699990272522}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8217999935150146},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6553000211715698},{"id":"https://openalex.org/C2776221188","wikidata":"https://www.wikidata.org/wiki/Q21072556","display_name":"Design space exploration","level":2,"score":0.6510000228881836},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5328999757766724},{"id":"https://openalex.org/C7366592","wikidata":"https://www.wikidata.org/wiki/Q1255620","display_name":"Dram","level":2,"score":0.5070000290870667},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.4921000003814697},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.48899999260902405},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.48899999260902405},{"id":"https://openalex.org/C91481028","wikidata":"https://www.wikidata.org/wiki/Q1054686","display_name":"Distributed memory","level":3,"score":0.43619999289512634},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4226999878883362},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.361299991607666},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.36039999127388},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.357699990272522},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.3560999929904938},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.33660000562667847},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.334199994802475},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.3294999897480011},{"id":"https://openalex.org/C2779602883","wikidata":"https://www.wikidata.org/wiki/Q15544750","display_name":"Memory architecture","level":2,"score":0.3276999890804291},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C144240696","wikidata":"https://www.wikidata.org/wiki/Q367204","display_name":"Address space","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.2937999963760376},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2818000018596649},{"id":"https://openalex.org/C147358964","wikidata":"https://www.wikidata.org/wiki/Q1200992","display_name":"Abstraction layer","level":3,"score":0.27950000762939453},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C200833197","wikidata":"https://www.wikidata.org/wiki/Q333707","display_name":"Compile time","level":3,"score":0.27709999680519104},{"id":"https://openalex.org/C104060986","wikidata":"https://www.wikidata.org/wiki/Q180046","display_name":"Space exploration","level":2,"score":0.266400009393692}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04750","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04750","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04750","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04750","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure","score":0.5186321139335632}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Advances":[0],"in":[1,10],"hybrid":[2],"bonding":[3],"and":[4,18,49,53,87,99,115,141,171,188,209,213],"packaging":[5],"have":[6],"driven":[7],"growing":[8],"interest":[9],"3D":[11,35,75,136,189],"DRAM-stacked":[12],"accelerators":[13],"with":[14,175],"higher":[15,183],"memory":[16,76],"bandwidth":[17],"capacity.":[19],"As":[20],"LLMs":[21],"scale":[22],"to":[23,55,122,181,224,234,238],"hundreds":[24],"of":[25,29],"billions":[26],"or":[27],"trillions":[28],"parameters,":[30],"distributed":[31,64,103,172],"inference":[32],"across":[33],"multiple":[34],"chips":[36],"becomes":[37],"essential.":[38],"With":[39,106,148],"cross-stack":[40],"co-design":[41,59],"increasingly":[42],"critical,":[43],"we":[44,119],"propose":[45],"DeepStack,":[46],"an":[47],"accurate":[48],"efficient":[50,155],"performance":[51],"model":[52],"tool":[54],"enable":[56],"early-stage":[57],"system-hardware":[58],"space":[60,151],"exploration":[61,156],"(DSE)":[62],"for":[63,102],"3D-stacked":[65,162],"AI":[66],"systems.":[67],"At":[68,90],"the":[69,91,206],"hardware":[70,214],"level,":[71,93],"DeepStack":[72,94,153,178,237],"captures":[73],"fine-grained":[74],"semantics":[77],"such":[78,110],"as":[79,111],"transaction-aware":[80],"bandwidth,":[81],"bank":[82],"activation":[83],"constraints,":[84],"buffering":[85],"limitations,":[86],"thermal-power":[88],"modeling.":[89],"system":[92],"incorporates":[95],"comprehensive":[96],"parallelization":[97],"strategies":[98],"execution":[100],"scheduling":[101],"LLM":[104],"inference.":[105],"novel":[107],"modeling":[108],"techniques":[109],"dual-stage":[112],"network":[113],"abstraction":[114],"tile-level":[116],"compute-communication":[117],"overlap,":[118],"achieve":[120],"up":[121,180],"100,000x":[123],"faster":[124],"runtime":[125],"over":[126,157],"state-of-the-art":[127],"simulators":[128],"at":[129],"comparable":[130],"accuracy,":[131],"cross-validated":[132],"against":[133],"our":[134],"in-house":[135],"designs,":[137,177],"NS-3":[138],"backend":[139],"(2.12%),":[140],"vLLM":[142],"serving":[143],"on":[144],"8xB200":[145],"GPUs":[146],"(12.18%).":[147],"hierarchical":[149],"design":[150,159],"search,":[152],"enables":[154],"2.5x10^14":[158],"points":[160],"spanning":[161],"DRAM":[163,165],"layers,":[164],"vertical":[166],"connectivity,":[167],"interconnect,":[168],"compute-memory":[169],"allocation,":[170],"scheduling.":[173],"Compared":[174],"baseline":[176],"achieves":[179],"9.5x":[182],"throughput":[184],"through":[185],"co-optimized":[186],"parallelism":[187,211],"architecture":[190,215],"search.":[191],"Our":[192],"DSE":[193],"further":[194],"reveals":[195],"that":[196,210],"batch":[197],"size":[198],"drives":[199],"a":[200],"more":[201],"fundamental":[202],"architectural":[203],"divide":[204],"than":[205],"prefill/decode":[207],"distinction,":[208],"strategy":[212],"are":[216],"tightly":[217],"coupled":[218],"--":[219],"incomplete":[220],"schedule":[221],"search":[222],"leads":[223],"permanently":[225],"suboptimal":[226],"silicon":[227],"irrecoverable":[228],"by":[229],"software":[230],"tuning.":[231],"We":[232],"intend":[233],"open":[235],"source":[236],"support":[239],"future":[240],"research.":[241]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-08T00:00:00"}
