{"id":"https://openalex.org/W4389576338","doi":"https://doi.org/10.14778/3626292.3626303","title":"Flash-LLM: Enabling Cost-Effective and Highly-Efficient Large Generative Model Inference with Unstructured Sparsity","display_name":"Flash-LLM: Enabling Cost-Effective and Highly-Efficient Large Generative Model Inference with Unstructured Sparsity","publication_year":2023,"publication_date":"2023-10-01","ids":{"openalex":"https://openalex.org/W4389576338","doi":"https://doi.org/10.14778/3626292.3626303"},"language":"en","primary_location":{"id":"doi:10.14778/3626292.3626303","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3626292.3626303","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005527086","display_name":"Haojun Xia","orcid":"https://orcid.org/0000-0002-9384-0935"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"The University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Haojun Xia","raw_affiliation_strings":["University of Sydney"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Sydney","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100459151","display_name":"Zheng Zhen","orcid":"https://orcid.org/0000-0002-8902-9357"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhen Zheng","raw_affiliation_strings":["Alibaba Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100743491","display_name":"Yuchao Li","orcid":"https://orcid.org/0000-0002-3369-2432"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yuchao Li","raw_affiliation_strings":["Alibaba Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051047559","display_name":"Donglin Zhuang","orcid":"https://orcid.org/0000-0003-3355-407X"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"The University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Donglin Zhuang","raw_affiliation_strings":["University of Sydney"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Sydney","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069302961","display_name":"Zhongzhu Zhou","orcid":"https://orcid.org/0000-0002-7786-6887"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"The University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Zhongzhu Zhou","raw_affiliation_strings":["University of Sydney"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Sydney","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102558097","display_name":"Xiafei Qiu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiafei Qiu","raw_affiliation_strings":["Alibaba Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100355277","display_name":"Yong Li","orcid":"https://orcid.org/0000-0001-5617-1659"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yong Li","raw_affiliation_strings":["Alibaba Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100636012","display_name":"Wei Lin","orcid":"https://orcid.org/0000-0002-3003-0150"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wei Lin","raw_affiliation_strings":["Alibaba Group"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Alibaba Group","institution_ids":["https://openalex.org/I4210095624"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043209884","display_name":"Shuaiwen Leon Song","orcid":"https://orcid.org/0000-0002-8402-1436"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"The University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Shuaiwen Leon Song","raw_affiliation_strings":["University of Sydney"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Sydney","institution_ids":["https://openalex.org/I129604602"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":11.5333,"has_fulltext":false,"cited_by_count":40,"citation_normalized_percentile":{"value":0.99346008,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"17","issue":"2","first_page":"211","last_page":"224"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8284112215042114},{"id":"https://openalex.org/keywords/memory-footprint","display_name":"Memory footprint","score":0.7057242393493652},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.6308759450912476},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.561601996421814},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.526749312877655},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.48972490429878235},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.48381465673446655},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4459027349948883},{"id":"https://openalex.org/keywords/heap","display_name":"Heap (data structure)","score":0.43226489424705505},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.428877055644989},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.33288833498954773},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3287070393562317},{"id":"https://openalex.org/keywords/computer-engineering","display_name":"Computer engineering","score":0.32553625106811523},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.2964988946914673},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.1404842734336853}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8284112215042114},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.7057242393493652},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.6308759450912476},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.561601996421814},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.526749312877655},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.48972490429878235},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.48381465673446655},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4459027349948883},{"id":"https://openalex.org/C134757568","wikidata":"https://www.wikidata.org/wiki/Q274089","display_name":"Heap (data structure)","level":2,"score":0.43226489424705505},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.428877055644989},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33288833498954773},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3287070393562317},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.32553625106811523},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2964988946914673},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.1404842734336853},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3626292.3626303","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3626292.3626303","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W2002555321","https://openalex.org/W2590246587","https://openalex.org/W2734941459","https://openalex.org/W2884700152","https://openalex.org/W2914631005","https://openalex.org/W2970139027","https://openalex.org/W3037847693","https://openalex.org/W3086105743","https://openalex.org/W3106803824","https://openalex.org/W3129093240","https://openalex.org/W3129831491","https://openalex.org/W3132107458","https://openalex.org/W3177263144","https://openalex.org/W3208099998","https://openalex.org/W3211878177","https://openalex.org/W4283318673","https://openalex.org/W4285225959","https://openalex.org/W4287363917","https://openalex.org/W4289646389","https://openalex.org/W4293023340","https://openalex.org/W4310282800","https://openalex.org/W4310563332","https://openalex.org/W4312903631","https://openalex.org/W4312983671","https://openalex.org/W4321448364","https://openalex.org/W4321636575","https://openalex.org/W4327911434","https://openalex.org/W4380433265"],"related_works":["https://openalex.org/W4242316017","https://openalex.org/W2081658857","https://openalex.org/W4255372513","https://openalex.org/W2127668151","https://openalex.org/W4386302689","https://openalex.org/W2068996722","https://openalex.org/W4319917399","https://openalex.org/W4221139464","https://openalex.org/W3193699965","https://openalex.org/W3004823601"],"abstract_inverted_index":{"With":[0],"the":[1,42,51,69,89,108,116,154,211],"fast":[2],"growth":[3],"of":[4,92,111,221],"parameter":[5],"size,":[6],"it":[7],"becomes":[8],"increasingly":[9],"challenging":[10],"to":[11,35,130,152,241],"deploy":[12],"large":[13,20,84],"generative":[14,85,112],"models":[15],"as":[16],"they":[17],"typically":[18],"require":[19],"GPU":[21,38],"memory":[22,39,156],"consumption":[23],"and":[24,41,81,139,196,216,223,243,248],"massive":[25],"computation.":[26],"Unstructured":[27],"model":[28,48,86,113],"pruning":[29],"has":[30],"been":[31],"a":[32,136],"common":[33],"approach":[34],"reduce":[36],"both":[37],"footprint":[40],"overall":[43],"computation":[44],"while":[45,159],"retaining":[46],"good":[47],"accuracy.":[49],"However,":[50],"existing":[52],"solutions":[53],"do":[54],"not":[55,165],"provide":[56],"an":[57,178,219],"efficient":[58,83,192],"support":[59,91],"for":[60,78,121,142,167,182,191,233],"handling":[61],"unstructured":[62,93,143,186],"sparsity":[63,94],"on":[64,68,95,103,170,174,230],"modern":[65],"GPUs,":[66],"especially":[67],"highly-structured":[70],"tensor":[71,100,123,171,183],"core":[72,184],"hardware.":[73],"Therefore,":[74],"we":[75,134,176],"propose":[76,135],"Flash-LLM":[77,208,238],"enabling":[79],"low-cost":[80],"highly":[82,98],"inference":[87,114,254],"with":[88,251],"sophisticated":[90],"high-performance":[96],"but":[97],"restrictive":[99],"cores.":[101,172],"Based":[102,173],"our":[104],"key":[105],"observation":[106],"that":[107,163,202],"main":[109],"bottleneck":[110,158],"is":[115,151],"several":[117],"skinny":[118],"matrix":[119,145],"multiplications":[120],"which":[122],"cores":[124],"would":[125],"be":[126],"significantly":[127,209,252],"under-utilized":[128],"due":[129],"low":[131],"computational":[132],"intensity,":[133],"general":[137],"Load-as-Sparse":[138],"Compute-as-Dense":[140],"methodology":[141],"sparse":[144,193],"multiplication":[146],"(SpMM).":[147],"The":[148],"basic":[149],"insight":[150],"address":[153],"significant":[155],"bandwidth":[157],"tolerating":[160],"redundant":[161],"computations":[162],"are":[164],"critical":[166],"end-to-end":[168,227],"performance":[169],"this,":[175],"design":[177],"effective":[179],"software":[180],"framework":[181,228],"based":[185],"SpMM,":[187],"leveraging":[188],"on-chip":[189],"resources":[190],"data":[194],"extraction":[195],"computation/memory-access":[197],"overlapping.":[198],"Extensive":[199],"evaluations":[200],"demonstrate":[201],"(1)":[203],"at":[204],"SpMM":[205],"kernel":[206],"level,":[207],"outperforms":[210],"state-of-the-art":[212],"library,":[213],"i.e.,":[214],"Sputnik":[215],"SparTA":[217],"by":[218],"average":[220],"2.9X":[222],"1.5X,":[224],"respectively.(2)":[225],"At":[226],"level":[229],"OPT-30B/66B/175B":[231],"models,":[232],"tokens":[234],"per":[235],"GPU-second":[236],",":[237],"achieves":[239],"up":[240],"3.8X":[242],"3.6X":[244],"improvement":[245],"over":[246],"DeepSpeed":[247],"FasterTransformer,":[249],"respectively,":[250],"lower":[253],"cost.":[255]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":25},{"year":2024,"cited_by_count":10}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
