{"id":"https://openalex.org/W4414198912","doi":"https://doi.org/10.1109/dac63849.2025.11133200","title":"SSFT: Algorithm and Hardware Co-design for Structured Sparse Fine-Tuning of Large Language Models","display_name":"SSFT: Algorithm and Hardware Co-design for Structured Sparse Fine-Tuning of Large Language Models","publication_year":2025,"publication_date":"2025-06-22","ids":{"openalex":"https://openalex.org/W4414198912","doi":"https://doi.org/10.1109/dac63849.2025.11133200"},"language":"en","primary_location":{"id":"doi:10.1109/dac63849.2025.11133200","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11133200","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100337832","display_name":"Miao Yu","orcid":"https://orcid.org/0009-0004-7730-3437"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Miao Yu","raw_affiliation_strings":["National University of Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069683581","display_name":"Trevor E. Carlson","orcid":"https://orcid.org/0000-0001-8742-134X"},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Trevor E. Carlson","raw_affiliation_strings":["National University of Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore","institution_ids":["https://openalex.org/I165932596"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5100337832"],"corresponding_institution_ids":["https://openalex.org/I165932596"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.13474461,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7922999858856201,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7922999858856201,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7771000266075134,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5841000080108643},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4422000050544739},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.42160001397132874},{"id":"https://openalex.org/keywords/execution-time","display_name":"Execution time","score":0.39959999918937683},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.39719998836517334},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.3937999904155731},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.38040000200271606},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3594000041484833}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8510000109672546},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6039000153541565},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5841000080108643},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4422000050544739},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4253999888896942},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.42160001397132874},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.39959999918937683},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.39719998836517334},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3937999904155731},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.38040000200271606},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.37599998712539673},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3594000041484833},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.35420000553131104},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.31139999628067017},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.30480000376701355},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.2976999878883362},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.29319998621940613},{"id":"https://openalex.org/C106516650","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm design","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2750999927520752},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.26820001006126404},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.267300009727478},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.2547999918460846},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.25429999828338623},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.25279998779296875},{"id":"https://openalex.org/C2781235140","wikidata":"https://www.wikidata.org/wiki/Q275131","display_name":"Scratch","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac63849.2025.11133200","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11133200","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W2805634098","https://openalex.org/W2912064332","https://openalex.org/W2923014074","https://openalex.org/W3016542674","https://openalex.org/W3045462440","https://openalex.org/W3159727696","https://openalex.org/W3176828726","https://openalex.org/W3206907172","https://openalex.org/W4280496502","https://openalex.org/W4293023328","https://openalex.org/W4375928954","https://openalex.org/W4379527488","https://openalex.org/W4386736489"],"related_works":[],"abstract_inverted_index":{"A":[0],"significant":[1],"number":[2],"of":[3,39,137],"users":[4],"depend":[5],"on":[6,44,64,140,163],"Large":[7],"Language":[8],"Models":[9],"(LLMs)":[10],"for":[11],"downstream":[12],"tasks,":[13],"but":[14],"training":[15],"LLMs":[16],"from":[17],"scratch":[18],"remains":[19],"prohibitively":[20],"expensive.":[21],"Sparse":[22,92],"finetuning":[23],"(SFT)":[24],"has":[25,52],"emerged":[26],"as":[27],"an":[28,121,130],"effective":[29],"strategy":[30],"to":[31,55,69,75,106,126,160],"reduce":[32,107],"both":[33,175],"the":[34,53,115,135,178],"time":[35],"and":[36,111,142,145,152,168,177,188,199],"memory":[37,108],"requirements":[38],"fine-tuning":[40],"LLMs,":[41],"achieving":[42],"accuracy":[43,159],"par":[45],"with":[46,149],"fully":[47],"fine-tuned":[48],"models.":[49],"Although":[50],"SFT":[51,63,116],"potential":[54],"achieve":[56],"superior":[57],"performance":[58],"by":[59,185,196],"minimizing":[60],"computational":[61],"requirements,":[62],"GPUs":[65,81,141,176],"often":[66],"underperforms":[67],"compared":[68],"dense":[70],"algorithms":[71],"like":[72],"LoRA":[73],"due":[74],"sparse":[76],"data":[77],"accesses":[78],"that":[79,155],"modern":[80],"cannot":[82],"efficiently":[83],"handle.":[84],"To":[85,118],"address":[86],"these":[87],"issues,":[88],"we":[89],"propose":[90],"Structured":[91],"FineTuning":[93],"(SSFT).":[94],"It":[95],"comprises":[96],"a":[97],"novel":[98],"algorithm,":[99],"SSFT-Alg,":[100,120],"which":[101],"introduces":[102],"predictable":[103],"sparsity":[104,138],"patterns":[105],"access":[109],"overhead":[110,136],"enhance":[112],"regularity":[113],"in":[114,183],"process.":[117],"support":[119],"accelerator,":[122],"SSFT-Hw,":[123],"is":[124],"proposed":[125],"optimize":[127],"SSFT-Alg":[128],"through":[129],"innovative":[131],"sparsity-aware":[132,180],"design,":[133],"avoiding":[134],"operations":[139],"optimizing":[143],"latency":[144],"energy":[146,194],"efficiency.":[147],"Experiments":[148],"relevant":[150],"models":[151,162],"benchmarks":[153],"demonstrate":[154],"SSFT":[156],"achieves":[157],"comparable":[158],"state-of-the-art":[161,179],"BERT,":[164],"LLaMA":[165,169],"2":[166,170],"7B,":[167],"13B.":[171],"Moreover,":[172],"SSFT-Hw":[173],"outperforms":[174],"transformer":[181],"accelerators":[182],"throughput":[184],"$51.0":[186],"\\times$":[187,198],"$1.32":[189],"\\times$,":[190],"respectively,":[191],"while":[192],"improving":[193],"efficiency":[195],"$19.0":[197],"$1.48":[200],"\\times$.":[201]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
