{"id":"https://openalex.org/W7134951474","doi":"https://doi.org/10.1145/3779212.3790176","title":"Insum: Sparse GPU Kernels Simplified and Optimized with Indirect Einsums","display_name":"Insum: Sparse GPU Kernels Simplified and Optimized with Indirect Einsums","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134951474","doi":"https://doi.org/10.1145/3779212.3790176"},"language":null,"primary_location":{"id":"doi:10.1145/3779212.3790176","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790176","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3779212.3790176","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061627890","display_name":"Jaeyeon Won","orcid":"https://orcid.org/0000-0002-3082-4348"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jaeyeon Won","raw_affiliation_strings":["CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA"],"raw_orcid":"https://orcid.org/0000-0002-3082-4348","affiliations":[{"raw_affiliation_string":"CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5060063626","display_name":"Willow Ahrens","orcid":"https://orcid.org/0000-0002-4963-0869"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Willow Ahrens","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"raw_orcid":"https://orcid.org/0000-0002-4963-0869","affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046791216","display_name":"Saman Amarasinghe","orcid":"https://orcid.org/0000-0002-7231-7643"},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Saman Amarasinghe","raw_affiliation_strings":["CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA"],"raw_orcid":"https://orcid.org/0000-0002-7231-7643","affiliations":[{"raw_affiliation_string":"CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024384625","display_name":"Joel Emer","orcid":null},"institutions":[{"id":"https://openalex.org/I63966007","display_name":"Massachusetts Institute of Technology","ror":"https://ror.org/042nb2s44","country_code":"US","type":"education","lineage":["https://openalex.org/I63966007"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joel S. Emer","raw_affiliation_strings":["CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA and Architecture Research Group, NVIDIA, Westford, MA, USA"],"raw_orcid":"https://orcid.org/0000-0002-3459-5466","affiliations":[{"raw_affiliation_string":"CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA and Architecture Research Group, NVIDIA, Westford, MA, USA","institution_ids":["https://openalex.org/I63966007"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5061627890"],"corresponding_institution_ids":["https://openalex.org/I63966007"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.44505358,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"993","last_page":"1006"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9463000297546387,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9463000297546387,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.007300000172108412,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.00559999980032444,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6741999983787537},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.598800003528595},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5388000011444092},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4560000002384186},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.43059998750686646},{"id":"https://openalex.org/keywords/sparse-approximation","display_name":"Sparse approximation","score":0.42250001430511475},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.41819998621940613},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.41290000081062317},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4092999994754791}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8162999749183655},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6741999983787537},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6302000284194946},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.598800003528595},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5388000011444092},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4560000002384186},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.43059998750686646},{"id":"https://openalex.org/C124066611","wikidata":"https://www.wikidata.org/wiki/Q28684319","display_name":"Sparse approximation","level":2,"score":0.42250001430511475},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.41819998621940613},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.4142000079154968},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.41290000081062317},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4092999994754791},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.3961000144481659},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.36070001125335693},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.35600000619888306},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3555000126361847},{"id":"https://openalex.org/C47487241","wikidata":"https://www.wikidata.org/wiki/Q5227230","display_name":"Data access","level":2,"score":0.3495999872684479},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33719998598098755},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3325999975204468},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.3199000060558319},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.31349998712539673},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.29190000891685486},{"id":"https://openalex.org/C138958017","wikidata":"https://www.wikidata.org/wiki/Q190087","display_name":"Data type","level":2,"score":0.28870001435279846},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.28630000352859497},{"id":"https://openalex.org/C190902152","wikidata":"https://www.wikidata.org/wiki/Q1325106","display_name":"Optimizing compiler","level":3,"score":0.271699994802475},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C199519371","wikidata":"https://www.wikidata.org/wiki/Q942695","display_name":"Source lines of code","level":3,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3779212.3790176","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790176","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3779212.3790176","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790176","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W2011832776","https://openalex.org/W2055312318","https://openalex.org/W2134237243","https://openalex.org/W2590246587","https://openalex.org/W2898123186","https://openalex.org/W2940862705","https://openalex.org/W2954698171","https://openalex.org/W2979310060","https://openalex.org/W3035965352","https://openalex.org/W3109154950","https://openalex.org/W3122286897","https://openalex.org/W3130660608","https://openalex.org/W4200390336","https://openalex.org/W4232315234","https://openalex.org/W4300171661","https://openalex.org/W4308083753","https://openalex.org/W4308262326","https://openalex.org/W4318541582","https://openalex.org/W4321500415","https://openalex.org/W4327911434","https://openalex.org/W4377864779","https://openalex.org/W4389491903","https://openalex.org/W4389500477","https://openalex.org/W4394998532","https://openalex.org/W4404955120","https://openalex.org/W4409310702","https://openalex.org/W4415007445"],"related_works":[],"abstract_inverted_index":{"Programming":[0],"high-performance":[1,41],"sparse":[2,49,73,81,97,144,165],"GPU":[3,42,120,166],"kernels":[4],"is":[5,184],"notoriously":[6],"difficult,":[7],"requiring":[8],"both":[9,48],"substantial":[10],"effort":[11],"and":[12,38,50,83,99,147],"deep":[13],"expertise.":[14],"Sparse":[15],"compilers":[16,54],"aim":[17],"to":[18,57,127,132,150,176],"simplify":[19],"this":[20,64],"process,":[21],"but":[22],"existing":[23],"systems":[24],"fall":[25],"short":[26],"in":[27],"two":[28,142],"key":[29],"ways.":[30],"First,":[31],"they":[32],"are":[33],"primarily":[34],"designed":[35,149],"for":[36,71,122,182],"CPUs":[37],"rarely":[39],"produce":[40],"code.":[43],"Second,":[44],"when":[45],"computations":[46],"involve":[47],"dense":[51,60,102],"regions,":[52],"these":[53,123],"often":[55],"fail":[56],"optimize":[58],"the":[59,114,128],"portions":[61],"effectively.":[62],"In":[63],"paper,":[65],"we":[66,112],"propose":[67],"a":[68,162],"new":[69],"approach":[70,157],"expressing":[72],"computations.":[74],"We":[75,139],"start":[76],"from":[77],"format-agnostic":[78],"Einsums":[79,124],"over":[80],"tensors":[82],"rewrite":[84],"them":[85],"into":[86],"format-conscious":[87],"indirect":[88,106,110,137,154],"Einsums,":[89,111],"which":[90,117],"explicitly":[91],"encode":[92],"format":[93],"information":[94],"by":[95,125,173],"mapping":[96],"data":[98],"metadata":[100],"onto":[101],"tensor":[103],"operations":[104],"through":[105],"indexing.":[107],"To":[108],"execute":[109],"introduce":[113],"Insum":[115,183],"compiler,":[116,130],"generates":[118],"efficient":[119],"code":[121,172,181],"lowering":[126],"PyTorch":[129],"extended":[131],"better":[133],"support":[134],"Tensor":[135],"Core\u2013enabled":[136],"Einsums.":[138,155],"also":[140],"present":[141],"fixed-length":[143],"formats,":[145],"GroupCOO":[146],"BlockGroupCOO,":[148],"fit":[151],"naturally":[152],"with":[153],"Our":[156],"achieves":[158],"1.14\u00d7\u20133.81\u00d7":[159],"speedups":[160],"across":[161],"range":[163],"of":[164,171],"applications":[167],"while":[168],"reducing":[169],"lines":[170],"202\u00d7\u20134491\u00d7":[174],"compared":[175],"hand-written":[177],"implementations.":[178],"The":[179],"source":[180],"publicly":[185],"available":[186],"at":[187],"https://github.com/nullplay/IndirectEinsum.":[188]},"counts_by_year":[],"updated_date":"2026-03-12T06:18:43.230356","created_date":"2026-03-12T00:00:00"}
