{"id":"https://openalex.org/W7137806425","doi":"https://doi.org/10.1609/aaai.v40i30.39720","title":"FlashSVD: Memory-Efficient Inference with Streaming for Low-Rank Models","display_name":"FlashSVD: Memory-Efficient Inference with Streaming for Low-Rank Models","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137806425","doi":"https://doi.org/10.1609/aaai.v40i30.39720"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i30.39720","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i30.39720","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/39720/43681","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/39720/43681","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102696700","display_name":"Zishan Shao","orcid":null},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zishan Shao","raw_affiliation_strings":["Department of Statistical Science, Duke University\nDepartment of Electrical & Computer Engineering, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Statistical Science, Duke University\nDepartment of Electrical & Computer Engineering, Duke University","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129669781","display_name":"Yixiao Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yixiao Wang","raw_affiliation_strings":["Department of Statistical Science, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Statistical Science, Duke University","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129715071","display_name":"Qinsi Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qinsi Wang","raw_affiliation_strings":["Department of Electrical & Computer Engineering, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Electrical & Computer Engineering, Duke University","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129691793","display_name":"Ting Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ting Jiang","raw_affiliation_strings":["Department of Electrical & Computer Engineering, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Electrical & Computer Engineering, Duke University","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129690897","display_name":"Zhixu Du","orcid":null},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhixu Du","raw_affiliation_strings":["Department of Electrical & Computer Engineering, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Electrical & Computer Engineering, Duke University","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000892367","display_name":"Hancheng Ye","orcid":"https://orcid.org/0000-0002-6272-2792"},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hancheng Ye","raw_affiliation_strings":["Department of Electrical & Computer Engineering, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Electrical & Computer Engineering, Duke University","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129703185","display_name":"Danyang Zhuo","orcid":null},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Danyang Zhuo","raw_affiliation_strings":["Department of Computer Science, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Duke University","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129650278","display_name":"Yiran Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yiran Chen","raw_affiliation_strings":["Department of Electrical & Computer Engineering, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Electrical & Computer Engineering, Duke University","institution_ids":["https://openalex.org/I170897317"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129746959","display_name":"Hai \u00a8Helen\u00a8 Li","orcid":null},"institutions":[{"id":"https://openalex.org/I170897317","display_name":"Duke University","ror":"https://ror.org/00py81415","country_code":"US","type":"education","lineage":["https://openalex.org/I170897317"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hai \u00a8Helen\u00a8 Li","raw_affiliation_strings":["Department of Electrical & Computer Engineering, Duke University"],"affiliations":[{"raw_affiliation_string":"Department of Electrical & Computer Engineering, Duke University","institution_ids":["https://openalex.org/I170897317"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5102696700"],"corresponding_institution_ids":["https://openalex.org/I170897317"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.01641791,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"30","first_page":"25278","last_page":"25285"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.1712999939918518,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.1712999939918518,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.10790000110864639,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.10700000077486038,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6154000163078308},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.5504999756813049},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.41819998621940613},{"id":"https://openalex.org/keywords/projection","display_name":"Projection (relational algebra)","score":0.4050000011920929},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.3903999924659729},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.3752000033855438},{"id":"https://openalex.org/keywords/byte","display_name":"Byte","score":0.3743000030517578},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.36160001158714294}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7842000126838684},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6154000163078308},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.5504999756813049},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.47440001368522644},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.41819998621940613},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.4050000011920929},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3903999924659729},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.3752000033855438},{"id":"https://openalex.org/C43364308","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Byte","level":2,"score":0.3743000030517578},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.36160001158714294},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.33410000801086426},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.32600000500679016},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3239000141620636},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.31850001215934753},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3059000074863434},{"id":"https://openalex.org/C22789450","wikidata":"https://www.wikidata.org/wiki/Q420904","display_name":"Singular value decomposition","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.296099990606308},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.28029999136924744},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C25797200","wikidata":"https://www.wikidata.org/wiki/Q828137","display_name":"Compression ratio","level":3,"score":0.2750999927520752},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i30.39720","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i30.39720","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/39720/43681","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i30.39720","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i30.39720","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/39720/43681","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7137806425.pdf","grobid_xml":"https://content.openalex.org/works/W7137806425.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Singular":[0],"Value":[1],"Decomposition":[2],"(SVD)":[3],"has":[4],"recently":[5],"gained":[6],"traction":[7],"as":[8],"an":[9,96],"effective":[10],"compression":[11,85],"technique":[12],"for":[13,86,102],"large":[14,127],"language":[15],"models":[16],"(LLMs),":[17],"with":[18,62,108,175],"many":[19],"studies":[20],"reporting":[21],"20-80%":[22],"parameter":[23],"reduction":[24,74],"at":[25],"minimal":[26],"accuracy":[27,177],"cost.":[28],"However,":[29],"despite":[30],"reducing":[31],"weight":[32],"memory,":[33,78],"existing":[34],"SVD-based":[35,110],"approaches":[36],"still":[37],"rely":[38],"on":[39],"standard":[40,157],"dense":[41],"CUDA":[42],"kernels":[43,117],"during":[44],"inference,":[45],"which":[46,60],"incur":[47],"substantial-and":[48],"ultimately":[49],"unnecessary-activation":[50],"memory":[51,165,172],"overhead.":[52],"Our":[53],"analysis":[54],"reveals":[55],"that":[56],"this":[57,91],"kernel-induced":[58],"cost,":[59],"grows":[61],"sequence":[63],"length":[64],"and":[65,112,120,143,145,170],"hidden":[66],"size,":[67],"in":[68,75],"worst":[69],"case":[70],"prevents":[71],"any":[72,109],"real":[73],"peak":[76,163],"inference":[77,100],"limiting":[79],"the":[80],"practical":[81],"impact":[82],"of":[83,134,186],"SVD":[84],"on-device":[87],"deployment.":[88],"To":[89],"address":[90],"bottleneck,":[92],"we":[93],"propose":[94],"FlashSVD,":[95],"end-to-end,":[97],"rank-aware":[98],"streaming":[99,131],"framework":[101],"SVD-compressed":[103],"LLMs.":[104,188],"FlashSVD":[105,161],"integrates":[106],"seamlessly":[107],"model":[111],"directly":[113],"fuses":[114],"low-rank":[115,180,187],"projection":[116],"into":[118],"self-attention":[119],"feed-forward":[121],"pipelines.":[122],"This":[123],"design":[124],"avoids":[125],"materializing":[126],"activation":[128,164],"buffers":[129],"by":[130,166,173],"small":[132],"tiles":[133],"truncated":[135],"factors":[136],"through":[137],"on-chip":[138],"SRAM,":[139],"performing":[140],"on-the-fly":[141],"multiplication":[142],"reduction,":[144],"immediately":[146],"evicting":[147],"results\u2013thus":[148],"preserving":[149],"high":[150],"GPU":[151],"occupancy":[152],"without":[153],"introducing":[154],"latency.":[155],"On":[156],"benchmarks":[158],"(e.g.,":[159],"BERT-Base),":[160],"reduces":[162],"up":[167],"to":[168],"70.2%":[169],"transient":[171],"75%,":[174],"zero":[176],"loss":[178],"against":[179],"baselines,":[181],"enabling":[182],"truly":[183],"memory-efficient":[184],"deployment":[185]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
