{"id":"https://openalex.org/W4411260261","doi":"https://doi.org/10.1145/3729262","title":"Task-Based Tensor Computations on Modern GPUs","display_name":"Task-Based Tensor Computations on Modern GPUs","publication_year":2025,"publication_date":"2025-06-10","ids":{"openalex":"https://openalex.org/W4411260261","doi":"https://doi.org/10.1145/3729262"},"language":"en","primary_location":{"id":"doi:10.1145/3729262","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3729262","pdf_url":null,"source":{"id":"https://openalex.org/S4210216081","display_name":"Proceedings of the ACM on Programming Languages","issn_l":"2475-1421","issn":["2475-1421"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Programming Languages","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1145/3729262","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047980263","display_name":"Rohan Yadav","orcid":"https://orcid.org/0000-0003-0746-066X"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Rohan Yadav","raw_affiliation_strings":["Stanford University, Stanford, USA"],"raw_orcid":"https://orcid.org/0000-0003-0746-066X","affiliations":[{"raw_affiliation_string":"Stanford University, Stanford, USA","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024606205","display_name":"Michael Garland","orcid":"https://orcid.org/0000-0001-6093-7602"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Garland","raw_affiliation_strings":["NVIDIA, Santa Clara, USA"],"raw_orcid":"https://orcid.org/0000-0001-6093-7602","affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029910004","display_name":"Alexander M. Aiken","orcid":"https://orcid.org/0000-0003-3372-9036"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alex Aiken","raw_affiliation_strings":["Stanford University, Stanford, USA"],"raw_orcid":"https://orcid.org/0000-0003-3372-9036","affiliations":[{"raw_affiliation_string":"Stanford University, Stanford, USA","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077152489","display_name":"Michael Bauer","orcid":"https://orcid.org/0000-0001-8928-3032"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Bauer","raw_affiliation_strings":["NVIDIA, Santa Clara, USA"],"raw_orcid":"https://orcid.org/0000-0001-8928-3032","affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5047980263"],"corresponding_institution_ids":["https://openalex.org/I97018004"],"apc_list":null,"apc_paid":null,"fwci":8.6185,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.97552699,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"9","issue":"PLDI","first_page":"396","last_page":"420"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9731000065803528,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.664665937423706},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6293171048164368},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6105965375900269},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.6073932647705078},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.5648670196533203},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5083109736442566},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.17545652389526367},{"id":"https://openalex.org/keywords/geometry","display_name":"Geometry","score":0.17362689971923828},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.163334459066391},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.06705087423324585}],"concepts":[{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.664665937423706},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6293171048164368},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6105965375900269},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.6073932647705078},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.5648670196533203},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5083109736442566},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.17545652389526367},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.17362689971923828},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.163334459066391},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.06705087423324585},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3729262","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3729262","pdf_url":null,"source":{"id":"https://openalex.org/S4210216081","display_name":"Proceedings of the ACM on Programming Languages","issn_l":"2475-1421","issn":["2475-1421"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Programming Languages","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3729262","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3729262","pdf_url":null,"source":{"id":"https://openalex.org/S4210216081","display_name":"Proceedings of the ACM on Programming Languages","issn_l":"2475-1421","issn":["2475-1421"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Programming Languages","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W1984689820","https://openalex.org/W2020165759","https://openalex.org/W2049875313","https://openalex.org/W2121893797","https://openalex.org/W2126952393","https://openalex.org/W2160523651","https://openalex.org/W2171399035","https://openalex.org/W2252007067","https://openalex.org/W2294413161","https://openalex.org/W2590246587","https://openalex.org/W2954698171","https://openalex.org/W3035965352","https://openalex.org/W3047091820","https://openalex.org/W3099525000","https://openalex.org/W3099878876","https://openalex.org/W3117137355","https://openalex.org/W3122286897","https://openalex.org/W3136479147","https://openalex.org/W3145506805","https://openalex.org/W4231715478","https://openalex.org/W4240382083","https://openalex.org/W4244157805","https://openalex.org/W4248722156","https://openalex.org/W4251637954","https://openalex.org/W4280495398","https://openalex.org/W4281707342","https://openalex.org/W4327930469","https://openalex.org/W4396817063"],"related_works":["https://openalex.org/W4231775656","https://openalex.org/W2046435967","https://openalex.org/W2383646825","https://openalex.org/W2371018915","https://openalex.org/W2354191502","https://openalex.org/W1972225038","https://openalex.org/W3134658850","https://openalex.org/W2355938171","https://openalex.org/W2780079842","https://openalex.org/W1429949169"],"abstract_inverted_index":{"Domain-specific,":[0],"fixed-function":[1,24,35],"units":[2,25,36,58],"are":[3,103,116,124],"becoming":[4],"increasingly":[5],"common":[6],"in":[7,141],"modern":[8],"processors.":[9],"As":[10],"the":[11,17,79,84,127,169,178,181],"computational":[12],"demands":[13],"of":[14,22,86,106,118,171,180,191],"applications":[15],"evolve,":[16],"capabilities":[18],"and":[19,47,115,120,140,175,195],"programming":[20,63,87,96],"interfaces":[21],"these":[23,57,88],"continue":[26],"to":[27,126],"change.":[28],"NVIDIA\u2019s":[29],"Hopper":[30],"GPU":[31],"architecture":[32,152],"contains":[33],"multiple":[34],"per":[37],"compute":[38],"unit,":[39],"including":[40],"an":[41,48],"asynchronous":[42,49,80,196],"data":[43,193],"movement":[44,194],"unit":[45,52],"(TMA)":[46],"matrix":[50],"multiplication":[51],"(Tensor":[53],"Core).":[54],"Efficiently":[55],"utilizing":[56],"requires":[59],"a":[60,94,104,131,150],"fundamentally":[61],"different":[62],"style":[64],"than":[65],"previous":[66],"architectures;":[67],"programmers":[68],"must":[69],"now":[70],"develop":[71],"warp-specialized":[72],"kernels":[73],"that":[74,111,134,153,160],"orchestrate":[75],"producer-consumer":[76],"pipelines":[77],"between":[78,176],"units.":[81],"To":[82],"manage":[83],"complexity":[85],"new":[89],"architectures,":[90],"we":[91],"introduce":[92],"Cypress,":[93],"task-based":[95],"model":[97],"with":[98,163],"sequential":[99],"semantics.":[100],"Cypress":[101,122,155,166],"programs":[102,123,156,159],"set":[105],"designated":[107],"functions":[108],"called":[109],"tasks":[110,137],"operate":[112],"on":[113,173],"tensors":[114,144],"free":[117],"communication":[119],"synchronization.":[121],"bound":[125],"target":[128],"machine":[129],"through":[130],"mapping":[132],"specification":[133],"describes":[135],"where":[136],"should":[138,145],"run":[139],"which":[142],"memories":[143],"be":[146],"materialized.":[147],"We":[148],"present":[149],"compiler":[151],"lowers":[154],"into":[157],"CUDA":[158],"perform":[161],"competitively":[162],"expert-written":[164],"codes.":[165],"achieves":[167],"0.88x-1.06x":[168],"performance":[170,179],"cuBLAS":[172],"GEMM,":[174],"0.80x-0.98x":[177],"currently":[182],"best-known":[183],"Flash":[184],"Attention":[185],"implementation":[186],"while":[187],"eliminating":[188],"all":[189],"aspects":[190],"explicit":[192],"computation":[197],"from":[198],"application":[199],"code.":[200]},"counts_by_year":[{"year":2026,"cited_by_count":4}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
