{"id":"https://openalex.org/W4414196916","doi":"https://doi.org/10.1109/dac63849.2025.11132627","title":"GoPTX: Fine-grained GPU Kernel Fusion by PTX-level Instruction Flow Weaving","display_name":"GoPTX: Fine-grained GPU Kernel Fusion by PTX-level Instruction Flow Weaving","publication_year":2025,"publication_date":"2025-06-22","ids":{"openalex":"https://openalex.org/W4414196916","doi":"https://doi.org/10.1109/dac63849.2025.11132627"},"language":"en","primary_location":{"id":"doi:10.1109/dac63849.2025.11132627","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132627","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087641894","display_name":"Kan Wu","orcid":"https://orcid.org/0009-0008-5828-847X"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Kan Wu","raw_affiliation_strings":["Sun Yat-sen University,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070842569","display_name":"Zejia Lin","orcid":"https://orcid.org/0000-0002-7205-4062"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zejia Lin","raw_affiliation_strings":["Sun Yat-sen University,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116500585","display_name":"Mengyue Xi","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyue Xi","raw_affiliation_strings":["Sun Yat-sen University,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhongchun Zheng","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongchun Zheng","raw_affiliation_strings":["Sun Yat-sen University,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023115735","display_name":"Wenxuan Pan","orcid":"https://orcid.org/0009-0007-2510-8666"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxuan Pan","raw_affiliation_strings":["Sun Yat-sen University,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100757687","display_name":"Xianwei Zhang","orcid":"https://orcid.org/0000-0003-4665-9145"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianwei Zhang","raw_affiliation_strings":["Sun Yat-sen University,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101633465","display_name":"Yutong Lu","orcid":"https://orcid.org/0000-0001-5315-3375"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yutong Lu","raw_affiliation_strings":["Sun Yat-sen University,Guangzhou,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,Guangzhou,China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5087641894"],"corresponding_institution_ids":["https://openalex.org/I157773358"],"apc_list":null,"apc_paid":null,"fwci":1.163,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.82723805,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.973800003528595,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10052","display_name":"Medical Image Segmentation Techniques","score":0.9671000242233276,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/weaving","display_name":"Weaving","score":0.5770999789237976},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5713000297546387},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5365999937057495},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.5090000033378601},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.49239999055862427},{"id":"https://openalex.org/keywords/control-flow","display_name":"Control flow","score":0.46639999747276306},{"id":"https://openalex.org/keywords/control-flow-graph","display_name":"Control flow graph","score":0.44130000472068787},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4287000000476837},{"id":"https://openalex.org/keywords/program-slicing","display_name":"Program slicing","score":0.42559999227523804}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8551999926567078},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6628999710083008},{"id":"https://openalex.org/C54525549","wikidata":"https://www.wikidata.org/wiki/Q2553445","display_name":"Weaving","level":2,"score":0.5770999789237976},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5713000297546387},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5365999937057495},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.5090000033378601},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.49239999055862427},{"id":"https://openalex.org/C160191386","wikidata":"https://www.wikidata.org/wiki/Q868299","display_name":"Control flow","level":2,"score":0.46639999747276306},{"id":"https://openalex.org/C27458966","wikidata":"https://www.wikidata.org/wiki/Q1187693","display_name":"Control flow graph","level":2,"score":0.44130000472068787},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4287000000476837},{"id":"https://openalex.org/C91071405","wikidata":"https://www.wikidata.org/wiki/Q1413145","display_name":"Program slicing","level":3,"score":0.42559999227523804},{"id":"https://openalex.org/C28034677","wikidata":"https://www.wikidata.org/wiki/Q17092530","display_name":"Interleaving","level":2,"score":0.4056999981403351},{"id":"https://openalex.org/C489000","wikidata":"https://www.wikidata.org/wiki/Q747385","display_name":"Data flow diagram","level":2,"score":0.40139999985694885},{"id":"https://openalex.org/C88468194","wikidata":"https://www.wikidata.org/wiki/Q1172416","display_name":"Data-flow analysis","level":3,"score":0.3952000141143799},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.38989999890327454},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.34299999475479126},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.3424000144004822},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.3183000087738037},{"id":"https://openalex.org/C2778787235","wikidata":"https://www.wikidata.org/wiki/Q49007","display_name":"Yarn","level":2,"score":0.31610000133514404},{"id":"https://openalex.org/C158100120","wikidata":"https://www.wikidata.org/wiki/Q1931402","display_name":"ANSI C","level":3,"score":0.2987000048160553},{"id":"https://openalex.org/C58013763","wikidata":"https://www.wikidata.org/wiki/Q5754574","display_name":"High-level synthesis","level":3,"score":0.2937999963760376},{"id":"https://openalex.org/C2776190703","wikidata":"https://www.wikidata.org/wiki/Q488148","display_name":"Slicing","level":2,"score":0.29319998621940613},{"id":"https://openalex.org/C42812","wikidata":"https://www.wikidata.org/wiki/Q1082910","display_name":"Partition (number theory)","level":2,"score":0.29089999198913574},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.2896000146865845},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2865000069141388},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.2800999879837036},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.2623000144958496},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/dac63849.2025.11132627","is_oa":false,"landing_page_url":"https://doi.org/10.1109/dac63849.2025.11132627","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 62nd ACM/IEEE Design Automation Conference (DAC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320316083","display_name":"Tencent","ror":"https://ror.org/00hhjss72"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2080592089","https://openalex.org/W2098274770","https://openalex.org/W2140348470","https://openalex.org/W2153190325","https://openalex.org/W2323693848","https://openalex.org/W2510980549","https://openalex.org/W2794729807","https://openalex.org/W2804032941","https://openalex.org/W3007718266","https://openalex.org/W3092533807","https://openalex.org/W4220818654","https://openalex.org/W4244917406","https://openalex.org/W4246166885","https://openalex.org/W4280633999","https://openalex.org/W4308090436","https://openalex.org/W4321636644","https://openalex.org/W4327911434","https://openalex.org/W4390097634","https://openalex.org/W4392265902","https://openalex.org/W4393407047","https://openalex.org/W4394998532"],"related_works":[],"abstract_inverted_index":{"GPUs":[0],"have":[1,14],"been":[2,15],"heavily":[3],"utilized":[4],"in":[5,42,156],"diverse":[6],"applications,":[7],"and":[8,36,92,112,161],"numerous":[9],"approaches,":[10],"including":[11],"kernel":[12,23,57],"fusion,":[13],"proposed":[16],"to":[17,32,82,118],"boost":[18],"GPU":[19],"efficiency":[20],"through":[21,62],"concurrent":[22,140],"execution.":[24],"However,":[25],"these":[26],"approaches":[27],"generally":[28],"overlook":[29],"the":[30,67,120,138],"opportunities":[31],"mitigate":[33],"warp":[34],"stalls":[35,95],"improve":[37],"instruction":[38,104,110],"level":[39],"parallelism":[40],"(ILP)":[41],"inter-kernel":[43],"resource":[44,150,162],"sharing.":[45],"To":[46],"address":[47],"this":[48],"issue,":[49],"we":[50],"introduce":[51],"GOPTX,":[52],"a":[53,72,102,143],"novel":[54],"design":[55],"for":[56,107],"fusion":[58],"that":[59,86,126],"improves":[60],"ILP":[61],"deliberate":[63],"weaving":[64,105],"instructions":[65,85],"at":[66],"PTX":[68],"level.":[69],"GOPTX":[70,127],"establishes":[71],"merged":[73],"control":[74],"flow":[75],"graph":[76],"(CFG)":[77],"from":[78],"original":[79],"kernels,":[80],"enabling":[81],"interleaving":[83],"of":[84,132,146],"were":[87],"sequentially":[88],"executed":[89],"by":[90],"default":[91],"minimizing":[93],"pipeline":[94],"on":[96],"data":[97],"hazards.":[98],"We":[99],"further":[100],"propose":[101],"latency-aware":[103],"algorithm":[106],"more":[108],"efficient":[109],"scheduling":[111,121],"an":[113,129],"adaptive":[114],"code":[115],"slicing":[116],"method":[117],"enlarge":[119],"space.":[122],"Experimental":[123],"evaluation":[124],"demonstrates":[125],"achieves":[128],"average":[130],"speedup":[131],"$\\mathbf{1":[133],"1.":[134],"2":[135],"\\%}$":[136],"over":[137],"baseline":[139],"execution,":[141],"with":[142],"maximum":[144],"improvement":[145],"23%.":[147],"The":[148],"hardware":[149],"utilization":[151],"statistics":[152],"show":[153],"significant":[154],"enhancements":[155],"eligible":[157],"warps":[158],"per":[159],"cycle":[160],"use.":[163]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
