{"id":"https://openalex.org/W4408902464","doi":"https://doi.org/10.1145/3676641.3716252","title":"COMET: Towards Practical W4A4KV4 LLMs Serving","display_name":"COMET: Towards Practical W4A4KV4 LLMs Serving","publication_year":2025,"publication_date":"2025-03-27","ids":{"openalex":"https://openalex.org/W4408902464","doi":"https://doi.org/10.1145/3676641.3716252"},"language":"en","primary_location":{"id":"doi:10.1145/3676641.3716252","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3676641.3716252","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3676641.3716252","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103253476","display_name":"Lian Liu","orcid":"https://orcid.org/0000-0003-2226-2303"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lian Liu","raw_affiliation_strings":["Institute of Computing Technology, CAS, Beijing, China, University of Chinese Academy of Sciences, Beijing, China, and Zhongguancun Laboratory, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, CAS, Beijing, China, University of Chinese Academy of Sciences, Beijing, China, and Zhongguancun Laboratory, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073321754","display_name":"Long Cheng","orcid":"https://orcid.org/0000-0003-1638-059X"},"institutions":[{"id":"https://openalex.org/I153473198","display_name":"North China Electric Power University","ror":"https://ror.org/04qr5t414","country_code":"CN","type":"education","lineage":["https://openalex.org/I153473198"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Long Cheng","raw_affiliation_strings":["North China Electric Power University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"North China Electric Power University, Beijing, China","institution_ids":["https://openalex.org/I153473198"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108262463","display_name":"Hongjun Ren","orcid":null},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haimeng Ren","raw_affiliation_strings":["ShanghaiTech University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"ShanghaiTech University, Shanghai, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100532497","display_name":"Xu Zhaohui","orcid":"https://orcid.org/0009-0003-8334-6903"},"institutions":[{"id":"https://openalex.org/I30809798","display_name":"ShanghaiTech University","ror":"https://ror.org/030bhh786","country_code":"CN","type":"education","lineage":["https://openalex.org/I30809798"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaohui Xu","raw_affiliation_strings":["ShanghaiTech University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"ShanghaiTech University, Beijing, China","institution_ids":["https://openalex.org/I30809798"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101320367","display_name":"Yudong Pan","orcid":"https://orcid.org/0009-0001-0012-4113"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yudong Pan","raw_affiliation_strings":["Institute of Computing Technology, CAS, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, CAS, Beijing, China and University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100707463","display_name":"Mengdi Wang","orcid":"https://orcid.org/0000-0002-7012-2308"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengdi Wang","raw_affiliation_strings":["Institute of Computing Technology, CAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, CAS, Beijing, China","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023380073","display_name":"Xiaowei Li","orcid":"https://orcid.org/0000-0002-0874-814X"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaowei Li","raw_affiliation_strings":["Institute of Computing Technology, CAS, Beijing, China and Zhongguancun Laboratory, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, CAS, Beijing, China and Zhongguancun Laboratory, Beijing, China","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016864694","display_name":"Yinhe Han","orcid":"https://orcid.org/0000-0003-0904-6681"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yinhe Han","raw_affiliation_strings":["Institute of Computing Technology, CAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, CAS, Beijing, China","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100346965","display_name":"Ying Wang","orcid":"https://orcid.org/0000-0001-5172-4736"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Wang","raw_affiliation_strings":["Institute of Computing Technology, CAS, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, CAS, Beijing, China","institution_ids":["https://openalex.org/I4210090176"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5103253476"],"corresponding_institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":7.4438,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.96797503,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"131","last_page":"146"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.9750000238418579,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.9750000238418579,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9657999873161316,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10917","display_name":"Smart Grid Security and Resilience","score":0.9621000289916992,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/comet","display_name":"Comet","score":0.7855089902877808},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.429377019405365},{"id":"https://openalex.org/keywords/astrobiology","display_name":"Astrobiology","score":0.26469606161117554},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.05811244249343872}],"concepts":[{"id":"https://openalex.org/C94081185","wikidata":"https://www.wikidata.org/wiki/Q3559","display_name":"Comet","level":2,"score":0.7855089902877808},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.429377019405365},{"id":"https://openalex.org/C87355193","wikidata":"https://www.wikidata.org/wiki/Q411","display_name":"Astrobiology","level":1,"score":0.26469606161117554},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.05811244249343872}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3676641.3716252","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3676641.3716252","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3676641.3716252","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3676641.3716252","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":32,"referenced_works":["https://openalex.org/W2606555609","https://openalex.org/W2946609015","https://openalex.org/W2981852735","https://openalex.org/W2998617917","https://openalex.org/W3081168214","https://openalex.org/W3094502228","https://openalex.org/W3138516171","https://openalex.org/W3194676777","https://openalex.org/W4220850685","https://openalex.org/W4281758439","https://openalex.org/W4289828024","https://openalex.org/W4304192541","https://openalex.org/W4309591680","https://openalex.org/W4313015712","https://openalex.org/W4321446237","https://openalex.org/W4366341968","https://openalex.org/W4381586827","https://openalex.org/W4386231419","https://openalex.org/W4387321091","https://openalex.org/W4387995158","https://openalex.org/W4389519530","https://openalex.org/W4391045969","https://openalex.org/W4391591383","https://openalex.org/W4392489911","https://openalex.org/W4399051247","https://openalex.org/W4401211704","https://openalex.org/W6600708310","https://openalex.org/W6727099177","https://openalex.org/W6739901393","https://openalex.org/W6778883912","https://openalex.org/W6803712692","https://openalex.org/W6893640197"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W3049149308","https://openalex.org/W3036339064","https://openalex.org/W3132686465","https://openalex.org/W3080418168","https://openalex.org/W2061245293","https://openalex.org/W3031501629","https://openalex.org/W2985497073"],"abstract_inverted_index":{"Quantization":[0],"is":[1],"a":[2,83,111,119,196,201,209],"widely-used":[3],"compression":[4],"technology":[5],"to":[6,41,124,140,157,179,214],"reduce":[7],"the":[8,52,63,72,77,136,142,166],"overhead":[9,143],"of":[10,144,204],"serving":[11,58],"large":[12],"language":[13],"models":[14,194],"(LLMs)":[15],"on":[16,67,195],"terminal":[17],"devices":[18],"and":[19,70,107,127,132,147,175,208],"in":[20],"cloud":[21],"data":[22,122,145],"centers.":[23],"However,":[24],"prevalent":[25],"quantization":[26,87],"methods,":[27],"such":[28,183],"as":[29,184],"8-bit":[30],"weight-activation":[31],"or":[32],"4-bit":[33,95],"weight-only":[34],"quantization,":[35],"achieve":[36,158],"limited":[37],"performance":[38],"improvements":[39],"due":[40],"poor":[42],"support":[43,101,180],"for":[44,51,59,105,130],"low-precision":[45],"(e.g.,":[46],"4-bit)":[47],"activation.":[48],"This":[49],"work,":[50],"first":[53],"time,":[54],"realizes":[55],"practical":[56],"W4A4KV4":[57],"LLMs,":[60],"fully":[61],"utilizing":[62,135],"INT4":[64],"tensor":[65],"cores":[66],"modern":[68],"GPUs":[69],"reducing":[71],"memory":[73],"bottleneck":[74],"caused":[75],"by":[76],"KV":[78],"cache.":[79],"Specifically,":[80],"we":[81,109,150],"propose":[82,151],"novel":[84,120],"fine-grained":[85,152],"mixed-precision":[86,102,121],"algorithm":[88],"(FMPQ)":[89],"that":[90],"compresses":[91],"most":[92],"activations":[93],"into":[94,170],"with":[96],"negligible":[97],"accuracy":[98],"loss.":[99],"To":[100],"matrix":[103],"multiplication":[104],"W4A4":[106],"W4A8,":[108],"develop":[110],"highly":[112],"optimized":[113,167],"W4Ax":[114,168],"kernel.":[115],"Our":[116],"approach":[117],"introduces":[118],"layout":[123],"facilitate":[125],"access":[126],"fast":[128],"dequantization":[129],"activation":[131],"weight":[133],"tensors,":[134],"GPU's":[137],"software":[138],"pipeline":[139],"hide":[141],"loading":[146],"conversion.":[148],"Additionally,":[149],"streaming":[153],"multiprocessor":[154],"(SM)":[155],"scheduling":[156],"load":[159],"balance":[160],"across":[161],"different":[162],"SMs.":[163],"We":[164],"integrate":[165],"kernel":[169],"our":[171],"inference":[172],"framework,":[173],"COMET,":[174],"provide":[176],"efficient":[177],"management":[178],"popular":[181],"LLMs":[182],"LLaMA-3-70B.":[185],"Extensive":[186],"evaluations":[187],"demonstrate":[188],"that,":[189],"when":[190],"running":[191],"LLaMA":[192],"family":[193],"single":[197],"A100-80G-SMX4,":[198],"COMET":[199],"achieves":[200],"kernel-level":[202],"speedup":[203],"2.88x":[205],"over":[206],"cuBLAS":[207],"2.02x":[210],"throughput":[211],"improvement":[212],"compared":[213],"TensorRT-LLM":[215],"from":[216],"an":[217],"end-to-end":[218],"framework":[219],"perspective.":[220]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":3}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
