{"id":"https://openalex.org/W4415435760","doi":"https://doi.org/10.1145/3771288","title":"CD-LLM: A Heterogeneous Multi-FPGA System for Batched Decoding of 70B+ LLMs Using a Compute-Dedicated Architecture","display_name":"CD-LLM: A Heterogeneous Multi-FPGA System for Batched Decoding of 70B+ LLMs Using a Compute-Dedicated Architecture","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W4415435760","doi":"https://doi.org/10.1145/3771288"},"language":"en","primary_location":{"id":"doi:10.1145/3771288","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3771288","pdf_url":null,"source":{"id":"https://openalex.org/S112809824","display_name":"ACM Transactions on Reconfigurable Technology and Systems","issn_l":"1936-7406","issn":["1936-7406","1936-7414"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Reconfigurable Technology and Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5074195618","display_name":"Wenheng Ma","orcid":"https://orcid.org/0000-0003-2349-7286"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenheng Ma","raw_affiliation_strings":["Tsinghua University, Beijing, China","Tsinghua University, China"],"raw_orcid":"https://orcid.org/0000-0003-2349-7286","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020244808","display_name":"Xinhao Yang","orcid":"https://orcid.org/0009-0001-9739-2930"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinhao Yang","raw_affiliation_strings":["Electronic Engineering, Tsinghua University, Beijing, China and Infinigence-AI, Beijing, China","Tsinghua University and Infinigence-AI, China"],"raw_orcid":"https://orcid.org/0009-0001-9739-2930","affiliations":[{"raw_affiliation_string":"Electronic Engineering, Tsinghua University, Beijing, China and Infinigence-AI, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University and Infinigence-AI, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026831784","display_name":"Shulin Zeng","orcid":"https://orcid.org/0000-0002-1030-3748"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shulin Zeng","raw_affiliation_strings":["Tsinghua University, Beijing, China","Tsinghua University and Infinigence-AI, China"],"raw_orcid":"https://orcid.org/0000-0002-1030-3748","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University and Infinigence-AI, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083660349","display_name":"Tengxuan Liu","orcid":"https://orcid.org/0009-0007-8943-1577"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tengxuan Liu","raw_affiliation_strings":["Tsinghua University, Beijing, China and Infinigence-AI, Beijing, China","Tsinghua University and Infinigence-AI, China"],"raw_orcid":"https://orcid.org/0009-0007-8943-1577","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China and Infinigence-AI, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University and Infinigence-AI, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079946965","display_name":"Libo Shen","orcid":"https://orcid.org/0009-0000-1717-2093"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Libo Shen","raw_affiliation_strings":["The Chinese University of Hong Kong, Hong Kong, Hong Kong","The Chinese University of Hong Kong, China"],"raw_orcid":"https://orcid.org/0009-0000-1717-2093","affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"The Chinese University of Hong Kong, China","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023007010","display_name":"Hongyi Wang","orcid":"https://orcid.org/0009-0008-7095-7963"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongyi Wang","raw_affiliation_strings":["Tsinghua University, Beijing, China","Tsinghua University and Infinigence-AI, China"],"raw_orcid":"https://orcid.org/0009-0008-7095-7963","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University and Infinigence-AI, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005872704","display_name":"Shiyao Li","orcid":"https://orcid.org/0009-0007-4755-6881"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiyao Li","raw_affiliation_strings":["Electronic Engineering, Tsinghua University, Beijing, China and Infinigence-AI, Beijing, China","Tsinghua University and Infinigence-AI, China"],"raw_orcid":"https://orcid.org/0009-0007-4755-6881","affiliations":[{"raw_affiliation_string":"Electronic Engineering, Tsinghua University, Beijing, China and Infinigence-AI, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University and Infinigence-AI, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014087671","display_name":"Ke Hong","orcid":"https://orcid.org/0000-0002-5768-6037"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ke Hong","raw_affiliation_strings":["Tsinghua University, Beijing, China and Infinigence-AI, Beijing, China","Tsinghua University and Infinigence-AI, China"],"raw_orcid":"https://orcid.org/0000-0002-5768-6037","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China and Infinigence-AI, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University and Infinigence-AI, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103138440","display_name":"Zhenhua Zhu","orcid":"https://orcid.org/0009-0007-9259-7180"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenhua Zhu","raw_affiliation_strings":["Tsinghua University, Beijing, China","Tsinghua University, China"],"raw_orcid":"https://orcid.org/0009-0007-9259-7180","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086217226","display_name":"Xuefei Ning","orcid":"https://orcid.org/0000-0003-2209-8312"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuefei Ning","raw_affiliation_strings":["Tsinghua University, Beijing, China","Tsinghua University, China"],"raw_orcid":"https://orcid.org/0000-0003-2209-8312","affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062800747","display_name":"Tsung-Yi Ho","orcid":"https://orcid.org/0000-0001-7348-5625"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Tsung-Yi Ho","raw_affiliation_strings":["The Chinese University of Hong Kong, Hong Kong, Hong Kong","The Chinese University of Hong Kong, China"],"raw_orcid":"https://orcid.org/0000-0001-7348-5625","affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Hong Kong, Hong Kong","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"The Chinese University of Hong Kong, China","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015946486","display_name":"Guohao Dai","orcid":"https://orcid.org/0000-0003-0849-3252"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guohao Dai","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China and Infinigence-AI, Beijing, China","Shanghai Jiao Tong University and Infinigence-AI, China"],"raw_orcid":"https://orcid.org/0000-0003-0849-3252","affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China and Infinigence-AI, Beijing, China","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Shanghai Jiao Tong University and Infinigence-AI, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100445061","display_name":"Yu Wang","orcid":"https://orcid.org/0000-0001-6108-5157"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Wang","raw_affiliation_strings":["Electronic Engineering, Tsinghua University, Beijing, China","Tsinghua University, China"],"raw_orcid":"https://orcid.org/0000-0001-6108-5157","affiliations":[{"raw_affiliation_string":"Electronic Engineering, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Tsinghua University, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":13,"corresponding_author_ids":["https://openalex.org/A5074195618"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14626216,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"19","issue":"1","first_page":"1","last_page":"25"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9943000078201294,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9943000078201294,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11693","display_name":"Cryptography and Residue Arithmetic","score":0.9872000217437744,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9675999879837036,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7577000260353088},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.6000999808311462},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.5217000246047974},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.48500001430511475},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.47110000252723694},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4587000012397766},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.4496000111103058}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8823000192642212},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7577000260353088},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.6000999808311462},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.5217000246047974},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.48500001430511475},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.47110000252723694},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4587000012397766},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.4496000111103058},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.32989999651908875},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3260999917984009},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.31619998812675476},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.3070000112056732},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.30480000376701355},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.30160000920295715},{"id":"https://openalex.org/C98025372","wikidata":"https://www.wikidata.org/wiki/Q477538","display_name":"Systems architecture","level":3,"score":0.29649999737739563},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.29010000824928284},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.2858000099658966},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2799000144004822},{"id":"https://openalex.org/C101765175","wikidata":"https://www.wikidata.org/wiki/Q577764","display_name":"Communications system","level":2,"score":0.26409998536109924},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.25920000672340393},{"id":"https://openalex.org/C65232700","wikidata":"https://www.wikidata.org/wiki/Q5656403","display_name":"Hardware architecture","level":3,"score":0.2578999996185303}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3771288","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3771288","pdf_url":null,"source":{"id":"https://openalex.org/S112809824","display_name":"ACM Transactions on Reconfigurable Technology and Systems","issn_l":"1936-7406","issn":["1936-7406","1936-7414"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Reconfigurable Technology and Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4815879075","display_name":null,"funder_award_id":"62325405, 62104128, 62203257, 62031017, 62406159, U21B2031","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G616102654","display_name":null,"funder_award_id":"2023YFB4502200","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W646738879","https://openalex.org/W1583837637","https://openalex.org/W2047674856","https://openalex.org/W2322411027","https://openalex.org/W2912416028","https://openalex.org/W2914304175","https://openalex.org/W2990138404","https://openalex.org/W3015468748","https://openalex.org/W4205983429","https://openalex.org/W4211095909","https://openalex.org/W4213153339","https://openalex.org/W4229005866","https://openalex.org/W4233429846","https://openalex.org/W4245659846","https://openalex.org/W4245873525","https://openalex.org/W4300845283","https://openalex.org/W4307934016","https://openalex.org/W4313033857","https://openalex.org/W4321637455","https://openalex.org/W4323654151","https://openalex.org/W4379260375","https://openalex.org/W4383754359","https://openalex.org/W4384561707","https://openalex.org/W4387321091","https://openalex.org/W4393403983","https://openalex.org/W4400682074","https://openalex.org/W4401211813","https://openalex.org/W4403322739","https://openalex.org/W4405936076","https://openalex.org/W4407953972","https://openalex.org/W4408049610","https://openalex.org/W4409248182"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"with":[4,123,248,253,276],"70":[5],"billion":[6],"or":[7,55],"more":[8],"parameters":[9],"are":[10],"increasingly":[11],"being":[12],"deployed":[13,269],"in":[14,70,87,295,302],"cloud-based":[15],"Model-as-a-Service":[16],"(MaaS)":[17],"scenarios.":[18],"To":[19,103],"meet":[20,61],"the":[21,62,80,258,273,315],"demands":[22],"of":[23,44,65,73,83,121,197,235,280,286],"such":[24,181],"deployments,":[25],"MaaS":[26],"providers":[27],"require":[28],"batched":[29,66,88,119],"LLM":[30,67],"decoding":[31,89,120],"systems":[32],"that":[33,173],"can":[34],"deliver":[35],"high":[36],"System":[37],"Throughput":[38],"(STP)":[39],"while":[40],"minimizing":[41],"Total":[42],"Cost":[43],"Ownership":[45],"(TCO).":[46],"However,":[47],"existing":[48],"FPGA-based":[49],"solutions":[50],"predominantly":[51],"focus":[52],"on":[53,94,203,272],"small-batch":[54],"single-batch":[56],"inference,":[57],"which":[58],"fails":[59],"to":[60,75,142,152,192,215,305],"computational":[63],"requirements":[64],"decoding,":[68],"resulting":[69],"performance":[71,176,196],"gaps":[72],"up":[74],"7.96":[76],"\\(\\times\\)":[77,293,300,323,328],".":[78],"Moreover,":[79],"low":[81],"utilization":[82,161,233],"multi-head":[84],"attention":[85,221,243],"operations":[86],"scenarios,":[90],"e.g.,":[91],"only":[92],"3.72%":[93],"A100":[95],"GPUs,":[96],"further":[97],"constrains":[98],"throughput":[99,285],"and":[100,157,185,222,228,240,261,297,326],"inflates":[101],"TCO.":[102,330],"address":[104],"these":[105],"challenges,":[106],"this":[107],"article":[108],"introduces":[109],"CD-LLM":[110,191,231,245,282,312],",":[111],"a":[112,128,137,169,194,210,249,277,284,291,298],"heterogeneous":[113,211,250],"multi-FPGA":[114,213,251],"system":[115,214],"designed":[116,247],"for":[117,237,242,270],"efficient":[118],"LLMs":[122],"70B+":[124],"parameters,":[125],"built":[126],"upon":[127],"C":[129],"ompute-":[130],"D":[131],"edicated":[132],"architecture.":[133],"First,":[134],"we":[135,149,167,208],"propose":[136],"memory-aligned":[138,163],"mixed-precision":[139],"quantization":[140],"engine":[141],"reduce":[143],"workload.":[144],"By":[145,219],"employing":[146],"importance-aware":[147],"quantization,":[148],"compress":[150],"Llama-3.1-70B":[151,274],"an":[153,254,306],"effective":[154],"3.45-bit":[155],"representation":[156],"achieve":[158,216],"72.33%":[159],"bandwidth":[160],"through":[162],"data":[164],"packing.":[165],"Second,":[166],"present":[168],"compute-dedicated":[170,188],"FPGA":[171,256,318],"architecture":[172,189],"maximizes":[174],"peak":[175,195],"by":[177],"leveraging":[178],"FPGA-specific":[179],"resources":[180],"as":[182,257,265],"DSPs,":[183],"BRAMs,":[184],"LUTs.":[186],"The":[187],"enables":[190],"reach":[193],"59.90":[198],"TOPS":[199],"at":[200],"600":[201],"MHz":[202],"U250":[204],"FPGA.":[205],"At":[206],"last,":[207],"introduce":[209],"master-slave":[212],"higher":[217,324],"utilization.":[218],"pipelining":[220],"linear":[223,238],"layer":[224],"computations":[225],"across":[226],"master":[227,259],"slave":[229,266],"FPGAs,":[230],"achieves":[232,283],"rates":[234],"83.08%":[236],"layers":[239],"68.30%":[241],"layers.":[244],"is":[246],"architecture,":[252],"HBM-enabled":[255],"accelerator":[260,319],"eight":[262],"DDR-based":[263],"FPGAs":[264],"accelerators.":[267],"When":[268],"inference":[271],"model":[275],"batch":[278],"size":[279],"256,":[281],"2,721.79":[287],"tokens/s.":[288],"This":[289],"represents":[290],"6.11":[292],"improvement":[294],"STP":[296,325],"4.71":[299],"reduction":[301],"TCO":[303],"compared":[304],"eight-card":[307,317],"RTX3090":[308],"GPU":[309],"system.":[310],"Furthermore,":[311],"substantially":[313],"outperforms":[314],"state-of-the-art":[316],"FlightLLM,":[320],"delivering":[321],"16.15":[322],"14.56":[327],"lower":[329]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-24T00:00:00"}
