{"id":"https://openalex.org/W7116350317","doi":"https://doi.org/10.1145/3754598.3754604","title":"Auto-Stencil: Performance-Driven Stencil Optimization with Hardware Feedback for LLMs","display_name":"Auto-Stencil: Performance-Driven Stencil Optimization with Hardware Feedback for LLMs","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116350317","doi":"https://doi.org/10.1145/3754598.3754604"},"language":null,"primary_location":{"id":"doi:10.1145/3754598.3754604","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754604","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754604","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120904960","display_name":"Quan Deng","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Quan Deng","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-7664-6559","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120945939","display_name":"Lin Gan","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lin Gan","raw_affiliation_strings":["Yau Mathematical Sciences Center, Tsinghua University, Beijing, China and Hetao Institute of Mathematics and Interdisciplinary Sciences, Shenzhen, China"],"raw_orcid":"https://orcid.org/0000-0003-1297-4462","affiliations":[{"raw_affiliation_string":"Yau Mathematical Sciences Center, Tsinghua University, Beijing, China and Hetao Institute of Mathematics and Interdisciplinary Sciences, Shenzhen, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hongkun Yu","orcid":"https://orcid.org/0000-0002-2591-6328"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongkun Yu","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-2591-6328","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074313324","display_name":"W. X. Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenlai Zhao","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-1036-4732","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5120898923","display_name":"Guangwen Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangwen Yang","raw_affiliation_strings":["Department of Computer Science and Technology, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-8673-8254","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Technology, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5120904960"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.64575835,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9090999960899353,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9090999960899353,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.02930000051856041,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.016599999740719795,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stencil","display_name":"Stencil","score":0.9681000113487244},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6108999848365784},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.602400004863739},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.5447999835014343},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.5392000079154968},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.5303999781608582},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.46309998631477356},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.42969998717308044}],"concepts":[{"id":"https://openalex.org/C76752949","wikidata":"https://www.wikidata.org/wiki/Q7607499","display_name":"Stencil","level":2,"score":0.9681000113487244},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8176000118255615},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6108999848365784},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.602400004863739},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5713000297546387},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.5447999835014343},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.5392000079154968},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.5303999781608582},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.46309998631477356},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.46209999918937683},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.42969998717308044},{"id":"https://openalex.org/C148027188","wikidata":"https://www.wikidata.org/wiki/Q907375","display_name":"Unit testing","level":3,"score":0.42410001158714294},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4162999987602234},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.40290001034736633},{"id":"https://openalex.org/C139571649","wikidata":"https://www.wikidata.org/wiki/Q1156793","display_name":"Program optimization","level":3,"score":0.3686000108718872},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.3215999901294708},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.31200000643730164},{"id":"https://openalex.org/C2778514511","wikidata":"https://www.wikidata.org/wiki/Q1374194","display_name":"Programmer","level":2,"score":0.3050000071525574},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.27399998903274536},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.26499998569488525},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.25360000133514404},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.2515999972820282}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3754598.3754604","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754604","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754604","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754604","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","score":0.4560023844242096,"display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W1558370006","https://openalex.org/W1598202309","https://openalex.org/W2014748196","https://openalex.org/W2033485113","https://openalex.org/W2074833026","https://openalex.org/W2077143534","https://openalex.org/W2083056254","https://openalex.org/W2100218206","https://openalex.org/W2101105183","https://openalex.org/W2133352531","https://openalex.org/W2141280299","https://openalex.org/W2173545083","https://openalex.org/W2293015622","https://openalex.org/W2528222261","https://openalex.org/W2619305731","https://openalex.org/W2772612468","https://openalex.org/W2775013565","https://openalex.org/W2889543163","https://openalex.org/W2936463352","https://openalex.org/W2996929894","https://openalex.org/W3089631566","https://openalex.org/W3138353446","https://openalex.org/W3203437055","https://openalex.org/W3205077974","https://openalex.org/W4283029140","https://openalex.org/W4285010634","https://openalex.org/W4285327580","https://openalex.org/W4387757648","https://openalex.org/W4391987273","https://openalex.org/W4405756205"],"related_works":[],"abstract_inverted_index":{"Stencil":[0],"computation":[1],"is":[2,167],"an":[3,148],"important":[4],"computing":[5],"pattern":[6],"from":[7],"numerous":[8],"scientific":[9],"simulations,":[10],"and":[11,25,44,97,147],"optimizing":[12],"stencil":[13,80,128],"for":[14,170],"modern":[15],"GPU":[16],"architectures":[17],"demands":[18],"specialized":[19],"expertise":[20],"in":[21],"both":[22,94],"parallel":[23],"programming":[24],"hardware-specific":[26],"optimizations.":[27],"However,":[28],"traditional":[29],"domain-specific":[30],"languages":[31],"based":[32],"auto-tuning":[33],"tools":[34,40],"offer":[35,41],"limited":[36,42],"flexibility;":[37],"generalized":[38],"auto-parallelization":[39],"performance;":[43],"large":[45],"language":[46],"models":[47],"produce":[48],"code":[49],"that":[50,64,88,115],"often":[51],"fails":[52],"to":[53,158],"compile":[54],"or":[55],"underperforms.":[56],"This":[57],"paper":[58],"presents":[59],"Auto-Stencil,":[60],"a":[61,78,84,105,159],"novel":[62],"framework":[63,137],"bridges":[65],"this":[66],"gap":[67],"by":[68],"integrating":[69],"LLMs":[70],"with":[71,83,93],"hardware-aware":[72],"reinforcement":[73],"learning.":[74],"Our":[75],"approach":[76],"combines":[77],"comprehensive":[79],"optimization":[81,145],"dataset":[82],"dual-objective":[85],"training":[86],"methodology":[87],"systematically":[89],"aligns":[90],"model":[91],"outputs":[92],"functional":[95],"correctness":[96],"performance":[98,125],"requirements.":[99],"By":[100],"incorporating":[101],"execution":[102],"feedback":[103],"through":[104],"performance-driven":[106],"reward":[107],"model,":[108],"Auto-Stencil":[109,166],"generates":[110],"highly":[111],"optimized":[112],"CUDA":[113],"implementations":[114],"not":[116],"only":[117],"pass":[118],"unit":[119],"tests":[120],"but":[121],"also":[122],"deliver":[123],"exceptional":[124],"across":[126,154],"diverse":[127],"patterns.":[129],"Experimental":[130],"results":[131,164],"demonstrate":[132],"the":[133,136],"superiority":[134],"of":[135,150,176],"over":[138],"state-of-the-art":[139],"alternatives:":[140],"100%":[141,144],"compilation":[142],"accuracy,":[143],"rate,":[146],"average":[149],"171":[151],"\u00d7":[152],"speedup":[153],"test":[155],"cases":[156],"compared":[157],"single":[160],"CPU":[161],"core.":[162],"The":[163],"show":[165],"particularly":[168],"suitable":[169],"large-scale":[171],"HPC":[172],"workflows":[173],"where":[174],"automation":[175],"complex,":[177],"architecture-specific":[178],"optimizations":[179],"can":[180],"significantly":[181],"reduce":[182],"development":[183],"effort":[184],"while":[185],"maintaining":[186],"excellent":[187],"performance.":[188]},"counts_by_year":[],"updated_date":"2025-12-21T02:06:08.432651","created_date":"2025-12-21T00:00:00"}
