{"id":"https://openalex.org/W7131102867","doi":"https://doi.org/10.1109/cgo68049.2026.11395190","title":"From Threads to Tiles: T2T, a Compiler for CUDA-to-NPU Translation via 2D Vectorization","display_name":"From Threads to Tiles: T2T, a Compiler for CUDA-to-NPU Translation via 2D Vectorization","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7131102867","doi":"https://doi.org/10.1109/cgo68049.2026.11395190"},"language":null,"primary_location":{"id":"doi:10.1109/cgo68049.2026.11395190","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cgo68049.2026.11395190","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125751663","display_name":"Shuaijiang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shuaijiang Li","raw_affiliation_strings":["SKLP, ICT, CAS UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126653761","display_name":"Jiacheng Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiacheng Zhao","raw_affiliation_strings":["SKLP, ICT, CAS UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126599171","display_name":"Ying Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ying Liu","raw_affiliation_strings":["SKLP, ICT, CAS UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126601544","display_name":"Shuoming Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shuoming Zhang","raw_affiliation_strings":["SKLP, ICT, CAS UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126592045","display_name":"Lei Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I9796191","display_name":"University College of Applied Science","ror":"https://ror.org/00f72x493","country_code":"PS","type":"education","lineage":["https://openalex.org/I9796191"]}],"countries":["PS"],"is_corresponding":false,"raw_author_name":"Lei Chen","raw_affiliation_strings":["UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"UCAS,Beijing,China","institution_ids":["https://openalex.org/I9796191"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126653637","display_name":"Yijin Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yijin Li","raw_affiliation_strings":["SKLP, ICT, CAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126616517","display_name":"Yangyu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yangyu Zhang","raw_affiliation_strings":["SKLP, ICT, CAS UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhicheng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhicheng Li","raw_affiliation_strings":["SKLP, ICT, CAS UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126585930","display_name":"Runyu Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Runyu Zhou","raw_affiliation_strings":["SKLP, ICT, CAS UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033198789","display_name":"Xiyu Shi","orcid":"https://orcid.org/0000-0001-6174-3383"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiyu Shi","raw_affiliation_strings":["SKLP, ICT, CAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS,Beijing,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095381671","display_name":"Chunwei Xia","orcid":"https://orcid.org/0000-0003-2014-5453"},"institutions":[{"id":"https://openalex.org/I130828816","display_name":"University of Leeds","ror":"https://ror.org/024mrxd33","country_code":"GB","type":"education","lineage":["https://openalex.org/I130828816"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Chunwei Xia","raw_affiliation_strings":["University of Leeds,Leeds,UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Leeds,Leeds,UK","institution_ids":["https://openalex.org/I130828816"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048534698","display_name":"Yuan Wen","orcid":"https://orcid.org/0000-0002-6747-947X"},"institutions":[{"id":"https://openalex.org/I195460627","display_name":"University of Aberdeen","ror":"https://ror.org/016476m91","country_code":"GB","type":"education","lineage":["https://openalex.org/I195460627"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yuan Wen","raw_affiliation_strings":["University of Aberdeen,Aberdeen,UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Aberdeen,Aberdeen,UK","institution_ids":["https://openalex.org/I195460627"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126617640","display_name":"Xiaobing Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaobing Feng","raw_affiliation_strings":["SKLP, ICT, CAS UCAS,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS,Beijing,China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086633294","display_name":"Huimin Cui","orcid":"https://orcid.org/0000-0002-2491-7679"},"institutions":[{"id":"https://openalex.org/I4210101410","display_name":"International Centre for Theoretical Physics Asia-Pacific","ror":"https://ror.org/01z2px678","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210101410","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huimin Cui","raw_affiliation_strings":["SKLP, ICT, CAS UCAS XCORESIGMA CO.,LTD.,Beijing,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"SKLP, ICT, CAS UCAS XCORESIGMA CO.,LTD.,Beijing,China","institution_ids":["https://openalex.org/I4210101410"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5125751663"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.25132458,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"362","last_page":"374"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3456000089645386,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3456000089645386,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.2978000044822693,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.12399999797344208,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.6373999714851379},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.621399998664856},{"id":"https://openalex.org/keywords/porting","display_name":"Porting","score":0.5182999968528748},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.4975000023841858},{"id":"https://openalex.org/keywords/instruction-level-parallelism","display_name":"Instruction-level parallelism","score":0.4643000066280365},{"id":"https://openalex.org/keywords/instruction-set","display_name":"Instruction set","score":0.45649999380111694},{"id":"https://openalex.org/keywords/implicit-parallelism","display_name":"Implicit parallelism","score":0.4235999882221222},{"id":"https://openalex.org/keywords/correctness","display_name":"Correctness","score":0.4002000093460083},{"id":"https://openalex.org/keywords/vectorization","display_name":"Vectorization (mathematics)","score":0.39719998836517334}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8632000088691711},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7355999946594238},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.6373999714851379},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.621399998664856},{"id":"https://openalex.org/C106251023","wikidata":"https://www.wikidata.org/wiki/Q851989","display_name":"Porting","level":3,"score":0.5182999968528748},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.4975000023841858},{"id":"https://openalex.org/C140763907","wikidata":"https://www.wikidata.org/wiki/Q2714055","display_name":"Instruction-level parallelism","level":3,"score":0.4643000066280365},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.45649999380111694},{"id":"https://openalex.org/C3543717","wikidata":"https://www.wikidata.org/wiki/Q6007302","display_name":"Implicit parallelism","level":4,"score":0.4235999882221222},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.4059999883174896},{"id":"https://openalex.org/C55439883","wikidata":"https://www.wikidata.org/wiki/Q360812","display_name":"Correctness","level":2,"score":0.4002000093460083},{"id":"https://openalex.org/C41681595","wikidata":"https://www.wikidata.org/wiki/Q7917855","display_name":"Vectorization (mathematics)","level":2,"score":0.39719998836517334},{"id":"https://openalex.org/C86111242","wikidata":"https://www.wikidata.org/wiki/Q859595","display_name":"Coprocessor","level":2,"score":0.3605000078678131},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.3472999930381775},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.3345000147819519},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.3172999918460846},{"id":"https://openalex.org/C42992933","wikidata":"https://www.wikidata.org/wiki/Q691169","display_name":"Task parallelism","level":3,"score":0.3160000145435333},{"id":"https://openalex.org/C137364921","wikidata":"https://www.wikidata.org/wiki/Q27929394","display_name":"Parallel programming model","level":3,"score":0.31540000438690186},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.31459999084472656},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.30300000309944153},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.29660001397132874},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C96972482","wikidata":"https://www.wikidata.org/wiki/Q1049168","display_name":"Xeon Phi","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C35390924","wikidata":"https://www.wikidata.org/wiki/Q661075","display_name":"Metaprogramming","level":2,"score":0.2809999883174896},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.27320000529289246}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cgo68049.2026.11395190","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cgo68049.2026.11395190","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE/ACM International Symposium on Code Generation and Optimization (CGO)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W38257615","https://openalex.org/W1031578623","https://openalex.org/W1502558230","https://openalex.org/W1590222205","https://openalex.org/W1991592471","https://openalex.org/W2010936531","https://openalex.org/W2041470524","https://openalex.org/W2055813380","https://openalex.org/W2067635581","https://openalex.org/W2080592089","https://openalex.org/W2124556751","https://openalex.org/W2128046183","https://openalex.org/W2157729530","https://openalex.org/W2313116527","https://openalex.org/W2606722458","https://openalex.org/W2906138391","https://openalex.org/W2949609733","https://openalex.org/W3020739840","https://openalex.org/W3028661980","https://openalex.org/W3097652614","https://openalex.org/W3121932930","https://openalex.org/W3122286897","https://openalex.org/W3149591378","https://openalex.org/W3157657667","https://openalex.org/W3157684176","https://openalex.org/W3196320218","https://openalex.org/W3204255478","https://openalex.org/W3205717712","https://openalex.org/W4220693235","https://openalex.org/W4225291709","https://openalex.org/W4246166885","https://openalex.org/W4253012315","https://openalex.org/W4289522463","https://openalex.org/W4312639064","https://openalex.org/W4321446270","https://openalex.org/W4321496380","https://openalex.org/W4392265915","https://openalex.org/W4407857939"],"related_works":[],"abstract_inverted_index":{"CUDA\u2019s":[0,66],"programming":[1],"model,":[2],"exposing":[3],"massive":[4],"parallelism":[5,64,111,126],"via":[6,98,118],"fine-grained":[7],"scalar":[8,67],"threads,":[9],"has":[10],"become":[11],"the":[12,46,62,70,99,125,145,239,244],"de":[13],"facto":[14],"standard":[15],"for":[16,237],"GPU":[17,210],"computing.":[18],"Concurrently,":[19],"NPUs":[20],"are":[21],"emerging":[22],"as":[23,58,81],"highly":[24],"efficient":[25,171],"accelerators,":[26],"but":[27],"their":[28],"architecture":[29],"is":[30,56,168,231],"fundamentally":[31],"different,":[32],"relying":[33],"on":[34,182,207],"coarse-grained,":[35],"explicit":[36,115],"2-D":[37,100,147,229],"tile-based":[38],"instructions.":[39],"This":[40],"creates":[41],"a":[42,77,82,88,106,113,162,190,224,232],"critical":[43],"challenge:":[44],"bridging":[45],"semantic":[47],"gap":[48],"\"From":[49],"Threads":[50],"to":[51,95,150,201,218,228,243],"Tiles\".":[52],"A":[53],"direct":[54],"translation":[55,97,214],"infeasible,":[57],"it":[59],"requires":[60],"lifting":[61,83],"implicit":[63,109],"of":[65,75,139,165,176,193,203,247],"model":[68],"into":[69,112],"explicit,":[71],"multi-dimensional":[72],"vector":[73],"space":[74],"NPUs,":[76],"problem":[78],"we":[79],"formalize":[80],"challenge.This":[84],"paper":[85],"introduces":[86],"T2T,":[87],"compiler":[89],"framework":[90],"that":[91,223],"automates":[92],"this":[93,129],"\"Threads":[94],"Tiles\"":[96],"Vectorization":[101],"technique.":[102],"T2T":[103,180,198],"first":[104],"transforms":[105],"CUDA":[107,160,205,241],"kernel\u2019s":[108],"SIMT":[110],"structured,":[114],"loop":[116],"nest":[117],"our":[119],"Unified":[120],"Parallelism":[121],"Abstraction":[122],"(UPA),":[123],"making":[124],"analyzable.":[127],"From":[128],"representation,":[130],"T2T\u2019s":[131],"core":[132],"vectorization":[133,175,230],"engine":[134],"systematically":[135],"selects":[136],"optimal":[137],"pairs":[138],"loops":[140],"and":[141,157,174,184,211,234],"maps":[142],"them":[143],"onto":[144],"NPU\u2019s":[146],"tile":[148],"instructions":[149],"maximize":[151],"hardware":[152],"utilization.":[153],"To":[154],"ensure":[155],"correctness":[156],"handle":[158],"performance-critical":[159],"features,":[161],"final":[163],"set":[164,192],"semantics-preserving":[166],"optimizations":[167],"applied,":[169],"including":[170],"control-flow":[172],"management":[173],"warp-level":[177],"intrinsics.We":[178],"implement":[179],"based":[181],"Polygeist":[183],"evaluate":[185],"representative":[186],"NPU":[187,248],"architectures.":[188],"On":[189],"diverse":[191],"benchmarks,":[194],"kernels":[195],"translated":[196],"by":[197,216],"achieve":[199],"up":[200,217],"73%":[202],"native":[204],"performance":[206],"an":[208],"A100":[209],"outperform":[212],"baseline":[213],"approaches":[215],"6.9\u00d7.":[219],"Our":[220],"work":[221],"demonstrates":[222],"systematic,":[225],"compiler-driven":[226],"approach":[227],"principled":[233],"high-performance":[235],"path":[236],"porting":[238],"rich":[240],"ecosystem":[242],"evolving":[245],"landscape":[246],"accelerators.":[249]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-02-24T00:00:00"}
