{"id":"https://openalex.org/W7160896994","doi":"https://doi.org/10.48550/arxiv.2605.08467","title":"CUDAHercules: Benchmarking Hardware-Aware Expert-level CUDA Optimization for LLMs","display_name":"CUDAHercules: Benchmarking Hardware-Aware Expert-level CUDA Optimization for LLMs","publication_year":2026,"publication_date":"2026-05-08","ids":{"openalex":"https://openalex.org/W7160896994","doi":"https://doi.org/10.48550/arxiv.2605.08467"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08467","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08467","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08467","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135998401","display_name":"Shiyang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shiyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135997948","display_name":"Zijian Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zijian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135967529","display_name":"Guangyan Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Guangyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128117458","display_name":"Yuebo Luo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Yuebo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128070792","display_name":"Winson Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Winson","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135972548","display_name":"Yanzhi Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yanzhi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135995732","display_name":"Mingyi Hong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hong, Mingyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135912905","display_name":"Caiwen Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Caiwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4004000127315521,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.4004000127315521,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.09719999879598618,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.09350000321865082,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.9632999897003174},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.6324999928474426},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6248999834060669},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5777000188827515},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.551800012588501},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4449999928474426},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4401000142097473}],"concepts":[{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.9632999897003174},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8129000067710876},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.6324999928474426},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6248999834060669},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5777000188827515},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5526999831199646},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.551800012588501},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4449999928474426},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4401000142097473},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.423799991607666},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.4059999883174896},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.36230000853538513},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3522999882698059},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.3481000065803528},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.3377000093460083},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.32749998569488525},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.31209999322891235},{"id":"https://openalex.org/C110332635","wikidata":"https://www.wikidata.org/wiki/Q629498","display_name":"Genetic programming","level":2,"score":0.29190000891685486},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27799999713897705},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.2757999897003174},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2599000036716461},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C107598950","wikidata":"https://www.wikidata.org/wiki/Q259864","display_name":"Microarchitecture","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08467","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08467","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08467","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08467","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,14,66,83],"show":[3,121],"promise":[4],"for":[5],"automated":[6,123],"CUDA":[7,33,78,81,124],"programming,":[8],"however":[9],"even":[10],"the":[11,92],"strongest":[12],"coding":[13],"(e.g.,":[15],"Claude-Opus-4.6)":[16],"may":[17],"still":[18],"fall":[19],"short":[20],"of":[21],"expert-level,":[22],"architecture-aware":[23],"optimization.":[24],"We":[25],"introduce":[26],"CUDAHercules,":[27],"a":[28,73],"benchmark":[29],"that":[30,122,142],"evaluates":[31],"generated":[32],"against":[34],"end-to-end":[35,58],"human-expert":[36],"SOTA":[37],"systems.":[38],"It":[39],"spans":[40],"single":[41],"kernels,":[42],"module-level":[43],"operators,":[44],"full":[45],"applications,":[46],"and":[47,54,70,79,86,105,131,139],"unsolved":[48],"challenge":[49],"tasks":[50,59],"across":[51],"Ampere,":[52],"Hopper,":[53],"Blackwell":[55],"GPUs,":[56],"with":[57],"gated":[60],"by":[61],"domain-specific":[62],"semantic":[63],"validators.":[64],"Evaluating":[65],"such":[67],"as":[68],"Claude-Opus-4.6":[69],"GPT-5.4":[71],"shows":[72],"large":[74],"gap":[75],"between":[76],"runnable":[77],"expert":[80,98],"engineering:":[82],"often":[84],"compile":[85],"pass":[87],"tests,":[88],"but":[89],"rarely":[90],"recover":[91],"optimization":[93],"strategies":[94],"needed":[95],"to":[96,146],"match":[97],"performance.":[99],"Application":[100],"semantics":[101],"further":[102],"reduce":[103],"success,":[104],"iterative":[106],"or":[107],"tool-augmented":[108],"feedback":[109],"can":[110],"improve":[111],"correctness":[112],"while":[113],"drifting":[114],"toward":[115],"slow":[116],"fallback":[117],"implementations.":[118],"These":[119],"results":[120],"programming":[125],"remains":[126],"far":[127],"from":[128],"fully":[129],"solved":[130],"requires":[132],"stronger":[133],"hardware":[134,147],"reasoning,":[135],"better":[136],"tool":[137],"use,":[138],"training":[140],"objectives":[141],"connect":[143],"code":[144],"understanding":[145],"architecture-grounded":[148],"intelligence.":[149]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-13T00:00:00"}
