{"id":"https://openalex.org/W4408891420","doi":"https://doi.org/10.1145/3689031.3717481","title":"SpInfer: Leveraging Low-Level Sparsity for Efficient Large Language Model Inference on GPUs","display_name":"SpInfer: Leveraging Low-Level Sparsity for Efficient Large Language Model Inference on GPUs","publication_year":2025,"publication_date":"2025-03-26","ids":{"openalex":"https://openalex.org/W4408891420","doi":"https://doi.org/10.1145/3689031.3717481"},"language":"en","primary_location":{"id":"doi:10.1145/3689031.3717481","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3689031.3717481","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Twentieth European Conference on Computer Systems","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054056254","display_name":"Ruibo Fan","orcid":"https://orcid.org/0009-0006-7492-2069"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ruibo Fan","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101297034","display_name":"Xiangrui Yu","orcid":"https://orcid.org/0009-0005-2478-1512"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiangrui Yu","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010390403","display_name":"Peijie Dong","orcid":"https://orcid.org/0000-0003-1952-4544"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peijie Dong","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100455077","display_name":"Zeyu Li","orcid":"https://orcid.org/0009-0008-4381-0544"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeyu Li","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112721592","display_name":"Gu Gong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu Gong","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), China"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018867983","display_name":"Qiang Wang","orcid":"https://orcid.org/0000-0002-2986-967X"},"institutions":[{"id":"https://openalex.org/I204983213","display_name":"Harbin Institute of Technology","ror":"https://ror.org/01yqg2h08","country_code":"CN","type":"education","lineage":["https://openalex.org/I204983213"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiang Wang","raw_affiliation_strings":["Harbin Institute of Technology, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Harbin Institute of Technology, Shenzhen, China","institution_ids":["https://openalex.org/I204983213"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076987000","display_name":"Wei Wang","orcid":"https://orcid.org/0000-0002-4585-4152"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Wei Wang","raw_affiliation_strings":["The Hong Kong University of Science and Technology, Hong Kong SAR"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology, Hong Kong SAR","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100730785","display_name":"Xiaowen Chu","orcid":"https://orcid.org/0000-0001-9745-4372"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]},{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Xiaowen Chu","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), China and The Hong Kong University of Science and Technology, Hong Kong SAR"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), China and The Hong Kong University of Science and Technology, Hong Kong SAR","institution_ids":["https://openalex.org/I200769079","https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5054056254"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":25.9252,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.99411069,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"243","last_page":"260"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.817394495010376},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7161703109741211},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4736044406890869},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.40192848443984985},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.3551292419433594},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.3218916058540344},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3179013133049011}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.817394495010376},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7161703109741211},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4736044406890869},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.40192848443984985},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3551292419433594},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3218916058540344},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3179013133049011}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3689031.3717481","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3689031.3717481","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Twentieth European Conference on Computer Systems","raw_type":"proceedings-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-151865","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-151865","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G281076837","display_name":null,"funder_award_id":"2024A03J0616","funder_id":"https://openalex.org/F4320323817","funder_display_name":"Universitas Brawijaya"},{"id":"https://openalex.org/G6422904392","display_name":null,"funder_award_id":"62272122","funder_id":"https://openalex.org/F4320323817","funder_display_name":"Universitas Brawijaya"}],"funders":[{"id":"https://openalex.org/F4320323817","display_name":"Universitas Brawijaya","ror":"https://ror.org/01wk3d929"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":55,"referenced_works":["https://openalex.org/W569478347","https://openalex.org/W2002555321","https://openalex.org/W2194775991","https://openalex.org/W2606722458","https://openalex.org/W2612690371","https://openalex.org/W2626696598","https://openalex.org/W2734941459","https://openalex.org/W2808128431","https://openalex.org/W2914631005","https://openalex.org/W2964337156","https://openalex.org/W3034628778","https://openalex.org/W3081168214","https://openalex.org/W3130421533","https://openalex.org/W3132695675","https://openalex.org/W3208099998","https://openalex.org/W4205983429","https://openalex.org/W4226401363","https://openalex.org/W4281758439","https://openalex.org/W4293024985","https://openalex.org/W4309591680","https://openalex.org/W4309672181","https://openalex.org/W4310510250","https://openalex.org/W4310563332","https://openalex.org/W4310895557","https://openalex.org/W4313484599","https://openalex.org/W4320063610","https://openalex.org/W4321636575","https://openalex.org/W4324297016","https://openalex.org/W4327911434","https://openalex.org/W4376167141","https://openalex.org/W4381586827","https://openalex.org/W4384705353","https://openalex.org/W4384705403","https://openalex.org/W4387302777","https://openalex.org/W4387321091","https://openalex.org/W4387595847","https://openalex.org/W4387995158","https://openalex.org/W4388327470","https://openalex.org/W4389576338","https://openalex.org/W4391012746","https://openalex.org/W4391045969","https://openalex.org/W4391375196","https://openalex.org/W4391986945","https://openalex.org/W4392270529","https://openalex.org/W4395020691","https://openalex.org/W4395106472","https://openalex.org/W4399051247","https://openalex.org/W4399447602","https://openalex.org/W4400611982","https://openalex.org/W4400681470","https://openalex.org/W4401211704","https://openalex.org/W4402704384","https://openalex.org/W6677103964","https://openalex.org/W6852962002","https://openalex.org/W6860312956"],"related_works":["https://openalex.org/W2055243143","https://openalex.org/W1986418932","https://openalex.org/W2357796999","https://openalex.org/W4321636575","https://openalex.org/W2741131631","https://openalex.org/W2045526782","https://openalex.org/W4405841813","https://openalex.org/W2156919374","https://openalex.org/W1483472507","https://openalex.org/W1984019423"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4],"demonstrated":[5],"remarkable":[6],"capabilities,":[7],"but":[8],"their":[9],"immense":[10],"scale":[11],"poses":[12],"significant":[13],"challenges":[14],"in":[15,39,165],"terms":[16],"of":[17,52,59,156,193],"both":[18,166],"memory":[19,167],"and":[20,56,125,146,150,169],"computational":[21,131],"costs.":[22],"While":[23],"unstructured":[24,194],"pruning":[25],"offers":[26],"promising":[27],"solutions":[28],"by":[29,101],"introducing":[30],"sparsity":[31,67,157,182],"to":[32,48,129,144,160,174],"reduce":[33],"resource":[34],"requirements,":[35],"realizing":[36],"its":[37],"benefits":[38],"LLM":[40,83,203],"inference":[41,84,171],"remains":[42],"elusive.":[43],"This":[44],"is":[45],"primarily":[46],"due":[47],"the":[49,57,189],"storage":[50],"overhead":[51,100],"indexing":[53,99],"non-zero":[54],"elements":[55],"inefficiency":[58],"sparse":[60,95],"matrix":[61],"multiplication":[62],"(SpMM)":[63],"kernels":[64],"at":[65,181],"low":[66,185],"levels":[68,158,183],"(around":[69],"50%).":[70],"In":[71],"this":[72],"paper,":[73],"we":[74],"present":[75],"SpInfer,":[76],"a":[77,93,154],"high-performance":[78],"framework":[79],"tailored":[80],"for":[81,107,202],"sparsified":[82],"on":[85],"GPUs.":[86],"SpInfer":[87,113,137,176],"introduces":[88],"Tensor-Core-Aware":[89],"Bitmap":[90,122],"Encoding":[91],"(TCA-BME),":[92],"novel":[94],"format":[96],"that":[97,136],"minimizes":[98],"leveraging":[102],"efficient":[103],"bitmap-based":[104],"indexing,":[105],"optimized":[106,116,179],"GPU":[108],"Tensor":[109],"Core":[110],"architectures.":[111],"Furthermore,":[112],"integrates":[114],"an":[115],"SpMM":[117,141],"kernel":[118],"with":[119,162],"Shared":[120],"Memory":[121],"Decoding":[123],"(SMBD)":[124],"asynchronous":[126],"pipeline":[127],"design":[128],"enhance":[130],"efficiency.":[132],"Experimental":[133],"results":[134],"show":[135],"significantly":[138],"outperforms":[139,177],"state-of-the-art":[140],"implementations":[142],"(up":[143,173],"2.14\u00d7":[145],"2.27\u00d7":[147],"over":[148],"Flash-LLM":[149],"SparTA,":[151],"respectively)":[152],"across":[153],"range":[155],"(30%":[159],"70%),":[161],"substantial":[163],"improvements":[164],"efficiency":[168],"end-to-end":[170],"speed":[172],"1.58\u00d7).":[175],"highly":[178],"cuBLAS":[180],"as":[184,186],"30%,":[187],"marking":[188],"first":[190],"effective":[191],"translation":[192],"pruning's":[195],"theoretical":[196],"advantages":[197],"into":[198],"practical":[199],"performance":[200],"gains":[201],"inference.":[204]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":7}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
