{"id":"https://openalex.org/W7155210327","doi":"https://doi.org/10.48550/arxiv.2604.18616","title":"ARGUS: Agentic GPU Optimization Guided by Data-Flow Invariants","display_name":"ARGUS: Agentic GPU Optimization Guided by Data-Flow Invariants","publication_year":2026,"publication_date":"2026-04-16","ids":{"openalex":"https://openalex.org/W7155210327","doi":"https://doi.org/10.48550/arxiv.2604.18616"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.18616","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18616","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.18616","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025225026","display_name":"Haohui Mai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mai, Haohui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134312945","display_name":"Xiaoyan Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Xiaoyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077337403","display_name":"Xiangyun Ding","orcid":"https://orcid.org/0009-0001-8367-8399"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Xiangyun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134295302","display_name":"Daifeng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Daifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108927525","display_name":"Qikun Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Qiuchu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134220842","display_name":"Chenzhun Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Chenzhun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134335193","display_name":"Cong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Cong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134353803","display_name":"Jiacheng Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Jiacheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134278789","display_name":"Christos Kozyrakis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kozyrakis, Christos","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134331573","display_name":"Binhang Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Binhang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8715999722480774,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8715999722480774,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.03200000151991844,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.015799999237060547,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6191999912261963},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.42480000853538513},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.4162999987602234},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.40779998898506165},{"id":"https://openalex.org/keywords/satisfiability-modulo-theories","display_name":"Satisfiability modulo theories","score":0.40529999136924744},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.40059998631477356},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.3935999870300293},{"id":"https://openalex.org/keywords/digital-subscriber-line","display_name":"Digital subscriber line","score":0.3806000053882599},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.3330000042915344}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8733999729156494},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6191999912261963},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.4796000123023987},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.42480000853538513},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4207000136375427},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.4162999987602234},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.40779998898506165},{"id":"https://openalex.org/C164155591","wikidata":"https://www.wikidata.org/wiki/Q2067766","display_name":"Satisfiability modulo theories","level":2,"score":0.40529999136924744},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.40059998631477356},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.3935999870300293},{"id":"https://openalex.org/C201374245","wikidata":"https://www.wikidata.org/wiki/Q104534","display_name":"Digital subscriber line","level":2,"score":0.3806000053882599},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3398999869823456},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.3330000042915344},{"id":"https://openalex.org/C106251023","wikidata":"https://www.wikidata.org/wiki/Q851989","display_name":"Porting","level":3,"score":0.3269999921321869},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.32330000400543213},{"id":"https://openalex.org/C190902152","wikidata":"https://www.wikidata.org/wiki/Q1325106","display_name":"Optimizing compiler","level":3,"score":0.3199999928474426},{"id":"https://openalex.org/C162838799","wikidata":"https://www.wikidata.org/wiki/Q596077","display_name":"Counterexample","level":2,"score":0.3133000135421753},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3100000023841858},{"id":"https://openalex.org/C2776937632","wikidata":"https://www.wikidata.org/wiki/Q4117718","display_name":"Program synthesis","level":2,"score":0.2865000069141388},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.2757999897003174},{"id":"https://openalex.org/C189950617","wikidata":"https://www.wikidata.org/wiki/Q937228","display_name":"Property (philosophy)","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.2694999873638153},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2671000063419342},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.25609999895095825},{"id":"https://openalex.org/C200833197","wikidata":"https://www.wikidata.org/wiki/Q333707","display_name":"Compile time","level":3,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.18616","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18616","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.18616","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.18616","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6932997703552246,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM-based":[0],"coding":[1],"agents":[2,49],"can":[3],"generate":[4],"functionally":[5],"correct":[6],"GPU":[7,29,189,199,212],"kernels,":[8],"yet":[9],"their":[10],"performance":[11,30],"remains":[12],"far":[13],"below":[14],"hand-optimized":[15,223],"libraries":[16],"on":[17,51,195],"critical":[18],"computations":[19],"such":[20],"as":[21],"matrix":[22],"multiplication,":[23],"attention,":[24,203],"and":[25,44,95,113,116,139,162,178,204,226,246],"Mixture-of-Experts":[26],"(MoE).":[27],"Peak":[28],"requires":[31],"coordinated":[32],"reasoning":[33],"over":[34,158,209],"tightly":[35],"coupled":[36],"optimizations,":[37],"including":[38],"tiling,":[39],"shared-memory":[40],"staging,":[41],"software":[42],"pipelining,":[43],"instruction":[45],"scheduling,":[46],"while":[47,98],"existing":[48,231],"rely":[50],"sparse":[52],"pass/fail":[53],"feedback,":[54],"leaving":[55],"them":[56],"unable":[57],"to":[58,107,119,175,237],"diagnose":[59],"global":[60],"constraint":[61],"violations.":[62],"We":[63,192],"present":[64],"Argus,":[65],"an":[66],"agentic":[67,232],"framework":[68],"that":[69],"addresses":[70],"this":[71],"through":[72,111],"data-flow":[73],"invariants:":[74],"compile-time":[75],"specifications":[76],"encoding":[77],"how":[78],"data":[79,112,137],"must":[80],"be":[81],"choreographed":[82],"throughout":[83],"kernel":[84],"execution.":[85],"Argus":[86,194,234],"introduces":[87],"a":[88,159,184],"tile-based,":[89],"Pythonic":[90],"DSL":[91,103],"exposing":[92],"hardware":[93],"instructions":[94],"compiler":[96,130],"policies":[97],"hiding":[99],"low-level":[100],"representations.":[101],"The":[102],"provides":[104],"tag":[105,117],"functions":[106],"propagate":[108],"symbolic":[109],"annotations":[110],"control":[114],"flow,":[115],"assertions":[118],"enforce":[120],"relational":[121],"constraints":[122],"at":[123,152],"use":[124],"sites.":[125],"When":[126],"violations":[127],"occur,":[128],"the":[129,135,196],"returns":[131],"concrete":[132],"counterexamples":[133],"identifying":[134],"thread,":[136],"element,":[138],"program":[140],"point,":[141],"enabling":[142],"dense,":[143],"structured":[144],"feedback":[145],"for":[146,208],"targeted":[147],"fixes.":[148],"Invariants":[149],"are":[150,227],"verified":[151],"compile":[153],"time":[154,213],"via":[155],"abstract":[156],"interpretation":[157],"layout":[160],"algebra":[161],"SMT":[163],"solving,":[164],"with":[165],"zero":[166],"runtime":[167],"overhead.":[168],"An":[169],"in-context":[170],"reinforcement":[171],"learning":[172],"planner":[173],"learns":[174],"select":[176],"optimizations":[177],"synthesize":[179],"effective":[180],"invariants,":[181],"supported":[182],"by":[183],"curated":[185],"knowledge":[186],"base":[187],"of":[188,211,221,243,248],"optimization":[190],"techniques.":[191],"evaluate":[193],"AMD":[197],"MI300X":[198],"across":[200],"GEMM,":[201],"flash":[202],"MoE":[205],"kernels":[206,218],"accounting":[207],"90%":[210,247],"in":[214],"LLM":[215],"inference.":[216],"Generated":[217],"achieve":[219],"99-104%":[220],"state-of-the-art":[222],"assembly":[224],"throughput":[225],"2-1543x":[228],"faster":[229],"than":[230],"systems.":[233],"further":[235],"generalizes":[236],"200":[238],"KernelBench":[239],"tasks,":[240],"solving":[241],"100%":[242],"Level":[244,249],"1":[245],"2":[250],"problems.":[251]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-23T00:00:00"}
