{"id":"https://openalex.org/W7131127438","doi":"https://doi.org/10.1109/cgo68049.2026.11395194","title":"Hexcute: A Compiler Framework for Automating Layout Synthesis in GPU Programs","display_name":"Hexcute: A Compiler Framework for Automating Layout Synthesis in GPU Programs","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7131127438","doi":"https://doi.org/10.1109/cgo68049.2026.11395194"},"language":null,"primary_location":{"id":"doi:10.1109/cgo68049.2026.11395194","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cgo68049.2026.11395194","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108698354","display_name":"Xiao Zhang","orcid":"https://orcid.org/0009-0000-0914-5669"},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Xiao Zhang","raw_affiliation_strings":["University of Toronto,NVIDIA,Toronto,Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Toronto,NVIDIA,Toronto,Canada","institution_ids":["https://openalex.org/I1304085615"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126723069","display_name":"Yaoyao Ding","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127509","display_name":"Vector Institute","ror":"https://ror.org/03kqdja62","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210127509"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Yaoyao Ding","raw_affiliation_strings":["University of Toronto,NVIDIA Vector Institute,Toronto,Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Toronto,NVIDIA Vector Institute,Toronto,Canada","institution_ids":["https://openalex.org/I4210127509"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126703507","display_name":"Bolin Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Bolin Sun","raw_affiliation_strings":["University of Toronto,NVIDIA,Toronto,Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Toronto,NVIDIA,Toronto,Canada","institution_ids":["https://openalex.org/I1304085615"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126711684","display_name":"Yang Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yang Hu","raw_affiliation_strings":["NVIDIA,Toronto,Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA,Toronto,Canada","institution_ids":["https://openalex.org/I1304085615"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076969648","display_name":"Tatiana Shpeisman","orcid":"https://orcid.org/0000-0002-4225-8734"},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Tatiana Shpeisman","raw_affiliation_strings":["NVIDIA,Toronto,Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA,Toronto,Canada","institution_ids":["https://openalex.org/I1304085615"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5007585346","display_name":"Gennady Pekhimenko","orcid":"https://orcid.org/0000-0002-3839-0919"},"institutions":[{"id":"https://openalex.org/I4210127509","display_name":"Vector Institute","ror":"https://ror.org/03kqdja62","country_code":"CA","type":"facility","lineage":["https://openalex.org/I4210127509"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Gennady Pekhimenko","raw_affiliation_strings":["University of Toronto,NVIDIA Vector Institute,Toronto,Canada"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Toronto,NVIDIA Vector Institute,Toronto,Canada","institution_ids":["https://openalex.org/I4210127509"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18205936,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"630","last_page":"643"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8492000102996826,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8492000102996826,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.06859999895095825,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.016599999740719795,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.7728999853134155},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.7717999815940857},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.7182999849319458},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.6118999719619751},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4767000079154968},{"id":"https://openalex.org/keywords/programming-paradigm","display_name":"Programming paradigm","score":0.4616999924182892},{"id":"https://openalex.org/keywords/code-generation","display_name":"Code generation","score":0.4498000144958496},{"id":"https://openalex.org/keywords/optimizing-compiler","display_name":"Optimizing compiler","score":0.3815999925136566}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9075000286102295},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.7728999853134155},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.7717999815940857},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.7182999849319458},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6693000197410583},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.6118999719619751},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.527400016784668},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4767000079154968},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.4616999924182892},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.4498000144958496},{"id":"https://openalex.org/C190902152","wikidata":"https://www.wikidata.org/wiki/Q1325106","display_name":"Optimizing compiler","level":3,"score":0.3815999925136566},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.33869999647140503},{"id":"https://openalex.org/C160191386","wikidata":"https://www.wikidata.org/wiki/Q868299","display_name":"Control flow","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.28380000591278076},{"id":"https://openalex.org/C173404611","wikidata":"https://www.wikidata.org/wiki/Q528588","display_name":"Constraint programming","level":3,"score":0.2777999937534332},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C76782552","wikidata":"https://www.wikidata.org/wiki/Q110546","display_name":"Just-in-time compilation","level":3,"score":0.2581000030040741},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.25609999895095825},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2513999938964844},{"id":"https://openalex.org/C19024347","wikidata":"https://www.wikidata.org/wiki/Q211496","display_name":"High-level programming language","level":3,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cgo68049.2026.11395194","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cgo68049.2026.11395194","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1536680647","https://openalex.org/W1970075481","https://openalex.org/W2000335122","https://openalex.org/W2042264308","https://openalex.org/W2110195531","https://openalex.org/W2111734760","https://openalex.org/W2171399035","https://openalex.org/W2194775991","https://openalex.org/W2757501351","https://openalex.org/W2804032941","https://openalex.org/W2953303875","https://openalex.org/W2954698171","https://openalex.org/W2963563691","https://openalex.org/W3012249773","https://openalex.org/W4244482609","https://openalex.org/W4251637954","https://openalex.org/W4281932226","https://openalex.org/W4312933868","https://openalex.org/W4318541538","https://openalex.org/W4318541647","https://openalex.org/W4327911434","https://openalex.org/W4327930469","https://openalex.org/W4380874786","https://openalex.org/W4387064014","https://openalex.org/W4387321091","https://openalex.org/W4396827130","https://openalex.org/W4408029577","https://openalex.org/W4411260261","https://openalex.org/W4417242273"],"related_works":[],"abstract_inverted_index":{"Efficient":[0],"GPU":[1,17,68,86],"programming":[2,45,60,69,132],"is":[3,23],"crucial":[4],"for":[5,91],"achieving":[6],"high":[7],"performance":[8,15,157],"in":[9,85],"deep":[10],"learning":[11],"(DL)":[12],"applications.":[13],"The":[14,32],"of":[16,146,158,174,191,199],"programs":[18],"depends":[19],"on":[20,37,75,164,207,211],"how":[21],"data":[22],"parallelized":[24],"across":[25],"threads":[26],"and":[27,50,82,93,105,123,134,149,162,167,209],"arranged":[28],"within":[29],"memory":[30],"subsystems.":[31],"mapping":[33],"functions":[34],"describing":[35],"tensors":[36],"GPUs":[38],"are":[39,96],"known":[40],"as":[41,48,72,129],"tensor":[42],"layouts.":[43,66],"Low-level":[44],"frameworks,":[46],"such":[47,71],"CUTLASS":[49],"Hidet,":[51],"provide":[52],"expressive":[53],"layout":[54,115,127],"abstractions":[55],"but":[56],"often":[57],"require":[58],"considerable":[59],"effort":[61],"to":[62,78,99,179,204],"manually":[63],"specify":[64],"optimal":[65,147],"High-level":[67],"languages,":[70],"Triton,":[73],"rely":[74],"compiler":[76,111],"heuristics":[77,90],"generate":[79],"dataflow,":[80],"layouts,":[81],"pipelining":[83,94],"strategies":[84,95],"programs.":[87],"However,":[88],"the":[89,156,172,196],"dataflow":[92,122],"not":[97],"generalizable":[98],"complex":[100],"operators.":[101],"To":[102],"balance":[103],"expressiveness":[104],"programmability,":[106],"we":[107],"propose":[108],"Hexcute,":[109],"a":[110,130,138,212],"framework":[112],"that":[113,153],"automates":[114],"synthesis":[116,128],"while":[117,170],"providing":[118],"explicit":[119],"control":[120],"over":[121,193],"pipelining.":[124],"Hexcute":[125,154,186,201],"formalizes":[126],"constraint":[131],"problem":[133],"solves":[135],"it":[136],"with":[137],"type-inference-based":[139],"algorithm.":[140],"This":[141],"approach":[142],"enables":[143],"systematic":[144],"exploration":[145],"layouts":[148],"instructions.Our":[150],"evaluation":[151],"shows":[152],"matches":[155],"libraries":[159],"like":[160],"cuBLAS":[161],"FlashAttention":[163],"GEMM,":[165],"Attention,":[166],"their":[168],"variants,":[169],"reducing":[171],"amount":[173],"code":[175],"by":[176],"1.27\u00d7-7.94\u00d7":[177],"compared":[178],"CUTLASS.":[180],"For":[181],"mixed-type":[182],"mixture-of-experts":[183],"(MoE)":[184],"operators,":[185],"achieves":[187],"an":[188],"average":[189],"speedup":[190,206],"6.46\u00d7":[192],"Triton.":[194],"In":[195],"end-to-end":[197],"evaluations":[198],"vLLM,":[200],"delivers":[202],"up":[203],"2.60\u00d7":[205],"DeepSeek-R1-AWQ":[208],"2.04\u00d7":[210],"Mamba-based":[213],"model.":[214]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-02T00:00:00"}
