{"id":"https://openalex.org/W7104034764","doi":"https://doi.org/10.1109/tc.2025.3628629","title":"Accelerating LLM Inference via Low-Bit Fine-Grained Quantization Algorithm and Bit-Level Accelerator Co-Design","display_name":"Accelerating LLM Inference via Low-Bit Fine-Grained Quantization Algorithm and Bit-Level Accelerator Co-Design","publication_year":2025,"publication_date":"2025-11-05","ids":{"openalex":"https://openalex.org/W7104034764","doi":"https://doi.org/10.1109/tc.2025.3628629"},"language":null,"primary_location":{"id":"doi:10.1109/tc.2025.3628629","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tc.2025.3628629","pdf_url":null,"source":{"id":"https://openalex.org/S157670870","display_name":"IEEE Transactions on Computers","issn_l":"0018-9340","issn":["0018-9340","1557-9956","2326-3814"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Computers","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Xilong Xie","orcid":"https://orcid.org/0009-0005-9988-2940"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xilong Xie","raw_affiliation_strings":["State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Liang Wang","orcid":"https://orcid.org/0000-0002-9537-9986"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang Wang","raw_affiliation_strings":["State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Limin Xiao","orcid":"https://orcid.org/0000-0001-9438-9181"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Limin Xiao","raw_affiliation_strings":["State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Li Ruan","orcid":"https://orcid.org/0000-0002-2386-961X"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li Ruan","raw_affiliation_strings":["State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Tairan Zhang","orcid":"https://orcid.org/0009-0005-4448-7505"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tairan Zhang","raw_affiliation_strings":["State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jinquan Wang","orcid":"https://orcid.org/0000-0001-6690-8386"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinquan Wang","raw_affiliation_strings":["State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yongyue Wang","orcid":"https://orcid.org/0000-0002-6960-6313"},"institutions":[{"id":"https://openalex.org/I4210127816","display_name":"San\u2019an Optoelectronics (China)","ror":"https://ror.org/03hjnd594","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210127816"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yongyue Wang","raw_affiliation_strings":["Jiangsu Shuguang Optoelectronics Company Ltd., Yangzhou, China"],"affiliations":[{"raw_affiliation_string":"Jiangsu Shuguang Optoelectronics Company Ltd., Yangzhou, China","institution_ids":["https://openalex.org/I4210127816"]}]},{"author_position":"last","author":{"id":null,"display_name":"Xiaojian Liao","orcid":"https://orcid.org/0000-0002-7924-9268"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaojian Liao","raw_affiliation_strings":["State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Complex &#x0026; Critical Software Environment (CCSE) and the School of Computer Science and Engineering, Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.77326175,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"75","issue":"2","first_page":"597","last_page":"611"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.3953999876976013,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.3953999876976013,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11209999769926071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.10939999669790268,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.6898999810218811},{"id":"https://openalex.org/keywords/memory-footprint","display_name":"Memory footprint","score":0.47749999165534973},{"id":"https://openalex.org/keywords/outlier","display_name":"Outlier","score":0.4771000146865845},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.45840001106262207},{"id":"https://openalex.org/keywords/adaptability","display_name":"Adaptability","score":0.4447999894618988},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.4404999911785126},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.37929999828338623},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.36160001158714294},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.3596000075340271}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8216999769210815},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.6898999810218811},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5012999773025513},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.47749999165534973},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.4771000146865845},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.45840001106262207},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.4447999894618988},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.4404999911785126},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.39579999446868896},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.37929999828338623},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3718999922275543},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.36160001158714294},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3596000075340271},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.34700000286102295},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.3140999972820282},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3109999895095825},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C2781039887","wikidata":"https://www.wikidata.org/wiki/Q1391724","display_name":"Factor (programming language)","level":2,"score":0.2971000075340271},{"id":"https://openalex.org/C125583679","wikidata":"https://www.wikidata.org/wiki/Q755673","display_name":"Search algorithm","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2922999858856201},{"id":"https://openalex.org/C106516650","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm design","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.2766000032424927},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C34146451","wikidata":"https://www.wikidata.org/wiki/Q5048094","display_name":"Cascade","level":2,"score":0.2590000033378601},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.25290000438690186},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tc.2025.3628629","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tc.2025.3628629","pdf_url":null,"source":{"id":"https://openalex.org/S157670870","display_name":"IEEE Transactions on Computers","issn_l":"0018-9340","issn":["0018-9340","1557-9956","2326-3814"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Computers","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy","score":0.8526197671890259}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2054095206","https://openalex.org/W2143823686","https://openalex.org/W2963122961","https://openalex.org/W2980200167","https://openalex.org/W2998617917","https://openalex.org/W3100985894","https://openalex.org/W3194676777","https://openalex.org/W3213241618","https://openalex.org/W4239379477","https://openalex.org/W4242118294","https://openalex.org/W4308083739","https://openalex.org/W4313015712","https://openalex.org/W4366341968","https://openalex.org/W4385245566","https://openalex.org/W4387321091","https://openalex.org/W4393147284","https://openalex.org/W4393407021","https://openalex.org/W4405434040","https://openalex.org/W4409248487","https://openalex.org/W4409248488","https://openalex.org/W4409248709","https://openalex.org/W4409362928","https://openalex.org/W4410583578","https://openalex.org/W4415796640"],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1,17],"models":[2],"(LLMs)":[3],"have":[4],"emerged":[5],"as":[6],"one":[7],"of":[8,89,149,197,207],"the":[9,23,79,87,191],"most":[10],"impactful":[11],"and":[12,27,62,84,91,116,167,200,210,215],"transformative":[13],"paradigms":[14],"in":[15],"natural":[16],"processing.":[18],"Despite":[19],"their":[20],"remarkable":[21],"success,":[22],"intensive":[24],"computational":[25],"demands":[26],"substantial":[28],"memory":[29,143],"footprint":[30],"impose":[31],"a":[32,46,71,136,155],"significant":[33],"barrier":[34],"to":[35,49,104,124,140,175],"efficient":[36,142],"LLM":[37,51,161],"inference.":[38],"<p":[39],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[40],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">In":[41],"this":[42],"paper,":[43],"we":[44,68,134,153],"present":[45],"comprehensive":[47],"solution":[48],"improve":[50],"inference":[52],"performance":[53],"under":[54,186],"ultra-low":[55,187],"weight":[56,188],"precision,":[57],"meticulously":[58],"optimized":[59],"through":[60,170],"algorithm":[61,181],"architecture":[63],"co-design.":[64],"To":[65],"achieve":[66],"this,":[67],"first":[69],"propose":[70],"fine-grained":[72],"intra-cluster":[73,99],"bit":[74],"allocation":[75],"method":[76],"that":[77],"partitions":[78],"weights":[80,108],"into":[81],"small":[82],"clusters":[83],"explicitly":[85],"considers":[86],"distribution":[88],"outliers":[90],"salient":[92],"points":[93],"within":[94],"each":[95],"cluster.":[96],"Then,":[97],"an":[98,112],"protection":[100],"mechanism":[101],"is":[102],"proposed":[103,192],"selectively":[105],"preserve":[106],"important":[107],"during":[109],"quantization,":[110],"where":[111],"extended":[113],"integer":[114],"format":[115],"group-wise":[117],"scale":[118],"factor":[119],"search":[120],"are":[121],"further":[122],"introduced":[123],"mitigate":[125],"accuracy":[126,185],"degradation":[127],"caused":[128],"by":[129],"aggressive":[130],"bit-width":[131],"reduction.":[132],"Furthermore,":[133],"develop":[135],"memory-aligned":[137],"encoding":[138],"scheme":[139],"facilitate":[141],"access":[144],"while":[145],"enabling":[146],"flexible":[147],"identification":[148],"mixed-precision":[150],"representations.":[151],"Finally,":[152],"design":[154,166],"lightweight":[156],"bit-level":[157,172,193],"accelerator":[158,194],"for":[159],"low-bit":[160],"inference,":[162],"offering":[163],"simplified":[164],"hardware":[165],"enhanced":[168],"adaptability":[169],"parallel":[171],"computation.":[173],"Compared":[174],"existing":[176],"state-of-the-art":[177],"quantization":[178],"algorithms,":[179],"our":[180],"achieves":[182],"higher":[183],"model":[184],"precision.":[189],"Meanwhile,":[190],"delivers":[195],"speedups":[196],"1.59\u00d7,":[198],"1.38\u00d7,":[199],"1.61\u00d7,":[201],"along":[202],"with":[203],"energy":[204],"efficiency":[205],"improvements":[206],"1.52\u00d7,":[208],"1.42\u00d7,":[209],"1.22\u00d7":[211],"over":[212],"ANT,":[213],"OliVe,":[214],"FineQ,":[216],"respectively.":[217]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-06T00:00:00"}
