{"id":"https://openalex.org/W7154729206","doi":"https://doi.org/10.48550/arxiv.2604.14825","title":"Nautilus: An Auto-Scheduling Tensor Compiler for Efficient Tiled GPU Kernels","display_name":"Nautilus: An Auto-Scheduling Tensor Compiler for Efficient Tiled GPU Kernels","publication_year":2026,"publication_date":"2026-04-16","ids":{"openalex":"https://openalex.org/W7154729206","doi":"https://doi.org/10.48550/arxiv.2604.14825"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.14825","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14825","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.14825","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133877641","display_name":"Yifan Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yifan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133892043","display_name":"Yuchen Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yuchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133910689","display_name":"Matei Budiu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Budiu, Matei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5057462458","display_name":"Sa\u0161a Misailov\u00edc","orcid":"https://orcid.org/0000-0001-7319-8845"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Misailovic, Sasa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9194999933242798,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9194999933242798,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.01549999974668026,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.006500000134110451,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.8339999914169312},{"id":"https://openalex.org/keywords/programmer","display_name":"Programmer","score":0.5914000272750854},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.4586000144481659},{"id":"https://openalex.org/keywords/optimizing-compiler","display_name":"Optimizing compiler","score":0.40470001101493835},{"id":"https://openalex.org/keywords/nautilus","display_name":"Nautilus","score":0.3343999981880188},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.3330000042915344}],"concepts":[{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.8339999914169312},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7681999802589417},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.617900013923645},{"id":"https://openalex.org/C2778514511","wikidata":"https://www.wikidata.org/wiki/Q1374194","display_name":"Programmer","level":2,"score":0.5914000272750854},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.46459999680519104},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.4586000144481659},{"id":"https://openalex.org/C190902152","wikidata":"https://www.wikidata.org/wiki/Q1325106","display_name":"Optimizing compiler","level":3,"score":0.40470001101493835},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3467000126838684},{"id":"https://openalex.org/C2781244455","wikidata":"https://www.wikidata.org/wiki/Q223939","display_name":"Nautilus","level":2,"score":0.3343999981880188},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.3330000042915344},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3292999863624573},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.3037000000476837},{"id":"https://openalex.org/C2780728851","wikidata":"https://www.wikidata.org/wiki/Q468402","display_name":"Tile","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C68859911","wikidata":"https://www.wikidata.org/wiki/Q1503724","display_name":"Pattern matching","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.27219998836517334},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2628999948501587},{"id":"https://openalex.org/C2982832238","wikidata":"https://www.wikidata.org/wiki/Q5531640","display_name":"General purpose","level":2,"score":0.26179999113082886}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.14825","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14825","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.14825","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14825","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"Nautilus,":[2],"a":[3,16,45,51,100],"novel":[4,52],"tensor":[5,21,94],"compiler":[6,95],"that":[7,54],"moves":[8],"toward":[9],"fully":[10],"automated":[11],"math-to-kernel":[12],"optimization.":[13],"Nautilus":[14,49,89,137],"compiles":[15],"high-level":[17,33,58,79],"algebraic":[18],"specification":[19],"of":[20,57,97,103,114],"operators":[22],"into":[23],"efficient":[24],"tiled":[25],"GPU":[26],"kernels.":[27],"Nautilus's":[28,70],"successive":[29],"lowering":[30],"design":[31],"allows":[32],"optimizations,":[34,59,80],"expression":[35],"rewrites,":[36],"and":[37,75,105,126,133,149],"tile":[38,68],"optimizations":[39],"to":[40,119,140,151],"be":[41],"jointly":[42],"applied":[43],"in":[44,77],"single":[46],"end-to-end":[47,93],"system.":[48],"presents":[50],"auto-scheduler":[53,71],"discovers":[55],"sequences":[56],"while":[60,156],"preserving":[61],"the":[62,78,91,111,117,120],"regular":[63],"program":[64],"structure":[65],"needed":[66],"by":[67],"optimizers.":[69],"captures":[72],"complex":[73],"interactions":[74],"trade-offs":[76],"including":[81],"aggressive":[82],"global":[83],"transformations":[84],"like":[85],"advanced":[86],"reduction":[87],"fusion.":[88],"is":[90],"first":[92],"capable":[96],"starting":[98],"from":[99,116],"math-like":[101],"description":[102],"attention":[104],"automatically":[106],"discovering":[107],"FlashAttention-3-like":[108],"kernels,":[109],"offloading":[110],"entire":[112],"burden":[113],"optimization":[115],"programmer":[118],"compiler.":[121],"Across":[122],"five":[123],"transformer-based":[124],"models":[125],"150":[127],"evaluation":[128],"configurations":[129],"on":[130,147,153,164],"NVIDIA":[131],"GH200":[132,148],"RTX":[134,154],"5090":[135],"GPUs,":[136],"achieves":[138],"up":[139,150],"23%":[141],"higher":[142],"throughput":[143],"than":[144],"state-of-the-art":[145],"compilers":[146],"42%":[152],"5090,":[155],"matching":[157],"or":[158],"exceeding":[159],"manually":[160],"written":[161],"cuDNN":[162],"kernels":[163],"many":[165],"long-sequence":[166],"configurations.":[167]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-18T00:00:00"}
