{"id":"https://openalex.org/W7154388104","doi":"https://doi.org/10.48550/arxiv.2604.10187","title":"WaveTune: Wave-aware Bilinear Modeling for Efficient GPU Kernel Auto-tuning","display_name":"WaveTune: Wave-aware Bilinear Modeling for Efficient GPU Kernel Auto-tuning","publication_year":2026,"publication_date":"2026-04-11","ids":{"openalex":"https://openalex.org/W7154388104","doi":"https://doi.org/10.48550/arxiv.2604.10187"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.10187","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10187","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.10187","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133563566","display_name":"Kaixuan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Kaixuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123545898","display_name":"Chutong Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Chutong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121798498","display_name":"Shiyou Qian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian, Shiyou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133585472","display_name":"Luping Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Luping","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133603154","display_name":"Jian Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Jian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133598899","display_name":"Guangtao Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Guangtao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133606043","display_name":"Cheng Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133562025","display_name":"Guodong Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Guodong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133606670","display_name":"Liping Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Liping","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3246999979019165,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3246999979019165,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.24629999697208405,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.09399999678134918,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.6413999795913696},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6272000074386597},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5109999775886536},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.49970000982284546},{"id":"https://openalex.org/keywords/bilinear-interpolation","display_name":"Bilinear interpolation","score":0.4535999894142151},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.43880000710487366},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.41290000081062317},{"id":"https://openalex.org/keywords/runtime-system","display_name":"Runtime system","score":0.3871999979019165},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.35589998960494995}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8425999879837036},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.6413999795913696},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6272000074386597},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5410000085830688},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5109999775886536},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.49970000982284546},{"id":"https://openalex.org/C205203396","wikidata":"https://www.wikidata.org/wiki/Q612143","display_name":"Bilinear interpolation","level":2,"score":0.4535999894142151},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.43880000710487366},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.41290000081062317},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.39320001006126404},{"id":"https://openalex.org/C2780870223","wikidata":"https://www.wikidata.org/wiki/Q1004415","display_name":"Runtime system","level":2,"score":0.3871999979019165},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.35589998960494995},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.33719998598098755},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.3224000036716461},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3131999969482422},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.3084999918937683},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3082999885082245},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.3057999908924103},{"id":"https://openalex.org/C202973057","wikidata":"https://www.wikidata.org/wiki/Q7380130","display_name":"Runtime verification","level":3,"score":0.3019999861717224},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.3009999990463257},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2831000089645386},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2741999924182892},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2653000056743622},{"id":"https://openalex.org/C34727166","wikidata":"https://www.wikidata.org/wiki/Q515375","display_name":"Cholesky decomposition","level":3,"score":0.26440000534057617},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.2567000091075897}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.10187","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10187","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.10187","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10187","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,18,58],"rapid":[1],"adoption":[2],"of":[3,20,48,213],"Large":[4],"Language":[5],"Models":[6],"(LLMs)":[7],"has":[8],"made":[9],"GPU":[10,176,184],"inference":[11],"efficiency":[12],"an":[13,134],"increasingly":[14],"critical":[15],"system":[16],"concern.":[17],"runtime":[19,49,96,108,163,173,207],"LLM":[21,243],"workloads":[22],"is":[23],"largely":[24],"dominated":[25],"by":[26,210],"tile-based":[27],"kernels,":[28],"particularly":[29],"General":[30],"Matrix":[31],"Multiplications":[32],"(GEMMs).":[33],"Although":[34],"these":[35,61],"kernels":[36,181],"are":[37],"highly":[38],"optimized,":[39],"their":[40],"performance":[41,93],"remains":[42],"sensitive":[43],"to":[44,67,74,118,127,161,194,200,216],"a":[45,68,89,104,114,147,156,166,236],"large":[46],"space":[47,126],"parameters,":[50],"such":[51],"as":[52],"tile":[53],"sizes":[54],"and":[55,63,83,95,122,155,171,182,198,232,238],"pipeline":[56],"stages.":[57],"interaction":[59],"between":[60,92,229],"parameters":[62],"hardware":[64],"resources":[65],"leads":[66],"non-convex":[69],"optimization":[70],"landscape.":[71],"Existing":[72],"approaches":[73],"parameter":[75],"configuration":[76,125,174,230],"--":[77,87],"including":[78],"search-based":[79],"auto-tuning,":[80],"heuristic":[81],"rules,":[82],"learned":[84],"cost":[85],"models":[86],"face":[88],"fundamental":[90],"trade-off":[91,228],"optimality":[94],"efficiency.":[97],"In":[98],"this":[99],"paper,":[100],"we":[101,112,132,145],"present":[102],"WaveTune,":[103],"wave-aware":[105,136],"framework":[106],"for":[107,175,241],"kernel":[109,142,190],"auto-tuning.":[110],"First,":[111],"introduce":[113],"unified":[115],"mapping":[116],"method":[117],"handle":[119],"input":[120],"diversity":[121],"decompose":[123],"the":[124,226],"manage":[128],"high":[129],"dimensionality.":[130],"Second,":[131],"develop":[133],"analytical":[135],"bilinear":[137],"model":[138],"that":[139,222],"accurately":[140],"predicts":[141],"latency.":[143],"Third,":[144],"design":[146],"sparse":[148],"sampling":[149],"scheme":[150],"based":[151],"on":[152],"wave":[153],"structures":[154],"lightweight":[157],"dual-table":[158],"retrieval":[159],"mechanism":[160],"minimize":[162],"overhead.":[164],"As":[165],"result,":[167],"WaveTune":[168,186,223],"enables":[169],"precise":[170],"efficient":[172],"kernels.":[177],"Across":[178],"three":[179],"representative":[180],"five":[183,211],"architectures,":[185],"consistently":[187],"achieves":[188],"near-optimal":[189],"performance,":[191],"delivering":[192],"up":[193,199],"1.83x":[195],"kernel-level":[196],"speedup":[197],"1.33x":[201],"end-to-end":[202],"TTFT":[203],"reduction,":[204],"while":[205],"reducing":[206],"decision":[208],"overhead":[209],"orders":[212],"magnitude":[214],"compared":[215],"exhaustive":[217],"search.":[218],"These":[219],"results":[220],"demonstrate":[221],"effectively":[224],"eliminates":[225],"traditional":[227],"latency":[231],"execution":[233],"optimality,":[234],"providing":[235],"practical":[237],"robust":[239],"solution":[240],"high-performance":[242],"inference.":[244]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-15T00:00:00"}
