{"id":"https://openalex.org/W4416429879","doi":"https://doi.org/10.1109/iccad66269.2025.11240702","title":"H3D-LLM: Heterogeneous 3D Chiplet Design for LLM Inference with Dynamic Task Scheduling and Memory-Aware Orchestration","display_name":"H3D-LLM: Heterogeneous 3D Chiplet Design for LLM Inference with Dynamic Task Scheduling and Memory-Aware Orchestration","publication_year":2025,"publication_date":"2025-10-26","ids":{"openalex":"https://openalex.org/W4416429879","doi":"https://doi.org/10.1109/iccad66269.2025.11240702"},"language":null,"primary_location":{"id":"doi:10.1109/iccad66269.2025.11240702","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240702","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102674132","display_name":"Hui Kou","orcid":"https://orcid.org/0009-0007-4874-0867"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hui Kou","raw_affiliation_strings":["Southeast University,School of Integrated Circuits,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Southeast University,School of Integrated Circuits,Nanjing,China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000982571","display_name":"Chenjie Xia","orcid":"https://orcid.org/0009-0005-2203-4011"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chenjie Xia","raw_affiliation_strings":["Southeast University,School of Integrated Circuits,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Southeast University,School of Integrated Circuits,Nanjing,China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082047323","display_name":"Jialin Yang","orcid":"https://orcid.org/0000-0001-6767-9028"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jialin Yang","raw_affiliation_strings":["Southeast University,School of Integrated Circuits,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Southeast University,School of Integrated Circuits,Nanjing,China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100767886","display_name":"Liyi Li","orcid":"https://orcid.org/0000-0003-2204-228X"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liyi Li","raw_affiliation_strings":["Southeast University,School of Integrated Circuits,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Southeast University,School of Integrated Circuits,Nanjing,China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079498326","display_name":"Hao Cai","orcid":"https://orcid.org/0000-0001-9794-8049"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Cai","raw_affiliation_strings":["Southeast University,School of Integrated Circuits,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Southeast University,School of Integrated Circuits,Nanjing,China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000787204","display_name":"Xin Si","orcid":"https://orcid.org/0000-0002-4993-0087"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Si","raw_affiliation_strings":["Southeast University,School of Integrated Circuits,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Southeast University,School of Integrated Circuits,Nanjing,China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100461621","display_name":"Bo Liu","orcid":"https://orcid.org/0000-0002-0894-1054"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Liu","raw_affiliation_strings":["Southeast University,School of Integrated Circuits,Nanjing,China"],"affiliations":[{"raw_affiliation_string":"Southeast University,School of Integrated Circuits,Nanjing,China","institution_ids":["https://openalex.org/I76569877"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5102674132"],"corresponding_institution_ids":["https://openalex.org/I76569877"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38489357,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4657999873161316,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4657999873161316,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.12399999797344208,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.06809999793767929,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5633999705314636},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.545799970626831},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5335999727249146},{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.5242000222206116},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.45899999141693115},{"id":"https://openalex.org/keywords/dynamic-priority-scheduling","display_name":"Dynamic priority scheduling","score":0.39969998598098755},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.39329999685287476},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.3831000030040741},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.3483999967575073}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8452000021934509},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5633999705314636},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.545799970626831},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5335999727249146},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.5242000222206116},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4984999895095825},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4758000075817108},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.45899999141693115},{"id":"https://openalex.org/C107568181","wikidata":"https://www.wikidata.org/wiki/Q5319000","display_name":"Dynamic priority scheduling","level":3,"score":0.39969998598098755},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.39329999685287476},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.3831000030040741},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.37059998512268066},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.36970001459121704},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.3483999967575073},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.34389999508857727},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3190000057220459},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.30790001153945923},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.3018999993801117},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.298799991607666},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.2962000072002411},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2892000079154968},{"id":"https://openalex.org/C3720319","wikidata":"https://www.wikidata.org/wiki/Q5015937","display_name":"Cache-only memory architecture","level":5,"score":0.27880001068115234},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.2621999979019165},{"id":"https://openalex.org/C2777958785","wikidata":"https://www.wikidata.org/wiki/Q17120940","display_name":"Resource efficiency","level":2,"score":0.25839999318122864},{"id":"https://openalex.org/C13481523","wikidata":"https://www.wikidata.org/wiki/Q412438","display_name":"Image compression","level":4,"score":0.25380000472068787},{"id":"https://openalex.org/C118021083","wikidata":"https://www.wikidata.org/wiki/Q610398","display_name":"System on a chip","level":2,"score":0.25380000472068787},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2531000077724457},{"id":"https://openalex.org/C160403385","wikidata":"https://www.wikidata.org/wiki/Q220543","display_name":"Queue","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccad66269.2025.11240702","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240702","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2105102111","https://openalex.org/W2118231264","https://openalex.org/W2155707315","https://openalex.org/W2587180584","https://openalex.org/W3159727696","https://openalex.org/W3189166979","https://openalex.org/W4223424212","https://openalex.org/W4252118812","https://openalex.org/W4280496502","https://openalex.org/W4312917873","https://openalex.org/W4360606196","https://openalex.org/W4375928954","https://openalex.org/W4385656513","https://openalex.org/W4389162736","https://openalex.org/W4389166691","https://openalex.org/W4392240059","https://openalex.org/W4392427708","https://openalex.org/W4392746141","https://openalex.org/W4392746501","https://openalex.org/W4393140448","https://openalex.org/W4393145114","https://openalex.org/W4393407316","https://openalex.org/W4401211807","https://openalex.org/W4401211878","https://openalex.org/W4402349286","https://openalex.org/W4402475781","https://openalex.org/W4404955137","https://openalex.org/W4406521098","https://openalex.org/W4407692060"],"related_works":[],"abstract_inverted_index":{"The":[0],"exponential":[1],"growth":[2],"of":[3],"Large":[4],"Language":[5],"Model":[6],"(LLM)":[7],"intensifies":[8],"hardware":[9],"demands":[10],"for":[11],"energy-efficient,":[12],"low-latency":[13],"architectures":[14],"with":[15,66,99,123,152],"scalable":[16],"memory":[17,26,133],"bandwidth.":[18],"While":[19],"3D":[20,118,211],"chiplet":[21],"integration":[22],"addresses":[23],"conventional":[24],"systems\u2019":[25],"wall":[27],"limitations,":[28],"three":[29,90],"critical":[30],"challenges":[31],"persist:":[32],"asymmetric":[33],"compression":[34,106],"constraints":[35],"from":[36,46],"divergent":[37],"sparsity-precision":[38],"requirements":[39],"across":[40],"attention/projection":[41],"layers,":[42],"tier-level":[43],"load":[44],"imbalance":[45,130],"static":[47],"resource":[48,129],"allocation":[49,157],"in":[50,58],"dynamic":[51,136,153],"computation":[52],"patterns,":[53],"and":[54,83,112,131,139,155,161,167,182,199],"coupling-induced":[55],"signal":[56,162],"degradation":[57],"high-density":[59],"TSV":[60,148,192],"networks,":[61],"especially":[62],"under":[63],"LLM-phase-specific":[64],"traffic":[65],"spatiotemporal":[67],"burstiness.":[68],"To":[69],"address":[70],"these,":[71],"we":[72],"present":[73],"H3D-LLM,":[74],"a":[75,93,117,146],"vertically":[76],"heterogeneous":[77],"architecture":[78],"combining":[79],"analog/digital":[80],"Computing-in-Memory":[81],"(CIM)":[82],"Neural":[84],"Processing":[85],"Unit":[86],"(NPU)":[87],"chiplets":[88],"through":[89,135,164],"innovations.":[91],"First,":[92],"Sparse-Aware":[94],"Dynamic":[95],"Execution":[96],"Framework":[97],"(SADEF)":[98],"Precision-Adaptive":[100],"Quantization":[101],"Mechanism":[102],"(PAQM)":[103],"enables":[104],"hardware-aware":[105],"via":[107],"layer-wise":[108],"unstructured":[109],"sparsity":[110],"detection":[111],"INT4/8-FP/BF16":[113],"mixed":[114],"precision.":[115],"Second,":[116],"Spatio-Temporal":[119],"Interleaved":[120],"Parallelism":[121],"(3D-STIP)":[122],"Semantic-Aware":[124],"Tiered":[125],"Storage":[126],"(SATS)":[127],"eliminates":[128],"improves":[132],"efficiency":[134,160,181],"sub-batch":[137],"partitioning":[138,166],"Key-Value":[140],"(KV)":[141],"cache":[142],"aware":[143],"management.":[144],"Third,":[145],"Phase-Adaptive":[147],"Management":[149],"(PATM)":[150],"scheme":[151],"encoding":[154],"cluster-based":[156],"enhances":[158],"inter-connect":[159],"integrity":[163],"runtime-aware":[165],"phase-specific":[168],"dataflow":[169],"scheduling.":[170],"Evaluations":[171],"on":[172],"Llama-7B":[173],"demonstrate":[174],"that":[175],"H3D-LLM":[176],"achieves":[177],"12.3\u00d7":[178],"higher":[179],"energy":[180],"8.4\u00d7":[183],"faster":[184],"inference":[185],"than":[186],"the":[187],"A800":[188],"GPU,":[189],"while":[190],"its":[191],"strategy":[193],"increases":[194],"eye":[195],"height":[196],"by":[197,204],"12%":[198],"reduces":[200],"bit":[201],"error":[202],"rate":[203],"up":[205],"to":[206,209],"60\u00d7":[207],"compared":[208],"na\u00efve":[210],"accelerators.":[212]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-11-20T00:00:00"}
