{"id":"https://openalex.org/W4409714146","doi":"https://doi.org/10.1145/3723851.3723860","title":"Cross-Vendor GPU Programming: Extending CUDA Beyond NVIDIA","display_name":"Cross-Vendor GPU Programming: Extending CUDA Beyond NVIDIA","publication_year":2025,"publication_date":"2025-03-30","ids":{"openalex":"https://openalex.org/W4409714146","doi":"https://doi.org/10.1145/3723851.3723860"},"language":"en","primary_location":{"id":"doi:10.1145/3723851.3723860","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3723851.3723860","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3723851.3723860","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 4th Workshop on Heterogeneous Composable and Disaggregated Systems","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3723851.3723860","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049155385","display_name":"Manos Pavlidakis","orcid":"https://orcid.org/0000-0002-8341-3083"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Manos Pavlidakis","raw_affiliation_strings":["Spectral Compute, Iraklio, Greece"],"raw_orcid":"https://orcid.org/0000-0002-8341-3083","affiliations":[{"raw_affiliation_string":"Spectral Compute, Iraklio, Greece","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Chris Kitching","orcid":"https://orcid.org/0000-0003-3224-5021"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chris Kitching","raw_affiliation_strings":["Spectral Compute, London, United Kingdom"],"raw_orcid":"https://orcid.org/0000-0003-3224-5021","affiliations":[{"raw_affiliation_string":"Spectral Compute, London, United Kingdom","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115672957","display_name":"Nicholas Tomlinson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nicholas Tomlinson","raw_affiliation_strings":["Spectral Compute, London, United Kingdom"],"raw_orcid":"https://orcid.org/0009-0005-4216-7524","affiliations":[{"raw_affiliation_string":"Spectral Compute, London, United Kingdom","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5115672958","display_name":"Michael S\u00f8ndergaard","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Michael S\u00f8ndergaard","raw_affiliation_strings":["Spectral Compute, London, United Kingdom"],"raw_orcid":"https://orcid.org/0009-0000-0304-5341","affiliations":[{"raw_affiliation_string":"Spectral Compute, London, United Kingdom","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.06090493,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"45","last_page":"51"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.9040431976318359},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7952128648757935},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6935968399047852},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.6525206565856934},{"id":"https://openalex.org/keywords/vendor","display_name":"Vendor","score":0.6230382919311523},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.31091952323913574},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.13899505138397217}],"concepts":[{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.9040431976318359},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7952128648757935},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6935968399047852},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.6525206565856934},{"id":"https://openalex.org/C2777338717","wikidata":"https://www.wikidata.org/wiki/Q1762621","display_name":"Vendor","level":2,"score":0.6230382919311523},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.31091952323913574},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.13899505138397217},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C162853370","wikidata":"https://www.wikidata.org/wiki/Q39809","display_name":"Marketing","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3723851.3723860","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3723851.3723860","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3723851.3723860","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 4th Workshop on Heterogeneous Composable and Disaggregated Systems","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3723851.3723860","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3723851.3723860","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3723851.3723860","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 4th Workshop on Heterogeneous Composable and Disaggregated Systems","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7","score":0.5299999713897705}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4409714146.pdf","grobid_xml":"https://content.openalex.org/works/W4409714146.grobid-xml"},"referenced_works_count":17,"referenced_works":["https://openalex.org/W1845641575","https://openalex.org/W2070167224","https://openalex.org/W2080592089","https://openalex.org/W2090584832","https://openalex.org/W2292282166","https://openalex.org/W2295598076","https://openalex.org/W2540279855","https://openalex.org/W2801778672","https://openalex.org/W2905443197","https://openalex.org/W3010640493","https://openalex.org/W3180530691","https://openalex.org/W3195293013","https://openalex.org/W4238014149","https://openalex.org/W4308427633","https://openalex.org/W4393970819","https://openalex.org/W4404787869","https://openalex.org/W4405835629"],"related_works":["https://openalex.org/W1963859303","https://openalex.org/W2364044215","https://openalex.org/W2389600408","https://openalex.org/W240129890","https://openalex.org/W3048701459","https://openalex.org/W2149078538","https://openalex.org/W2370314112","https://openalex.org/W1912958759","https://openalex.org/W2005148983","https://openalex.org/W2792081825"],"abstract_inverted_index":{"The":[0],"dominance":[1],"of":[2,78,116,182,205],"NVIDIA's":[3,92,163],"CUDA":[4,56,81,117,147,152,207],"platform":[5,111],"in":[6,141,187],"GPU":[7,195],"programming":[8],"has":[9],"revolutionized":[10],"fields":[11],"such":[12],"as":[13,97],"machine":[14],"learning":[15],"(ML),":[16],"scientific":[17],"simulations,":[18],"and":[19,35,42,49,70,136,158,197,227,230],"computational":[20],"biology.However,":[21],"its":[22],"exclusivity":[23],"to":[24,54,61,72,103,166,177,218],"NVIDIA":[25,226],"GPUs":[26,121,229],"poses":[27],"significant":[28],"challenges,":[29],"including":[30],"vendor":[31],"lockin,":[32],"higher":[33],"costs,":[34],"reduced":[36],"hardware":[37],"flexibility.Cross-platform":[38],"solutions":[39],"like":[40],"HIP":[41],"SYCL":[43],"often":[44],"require":[45],"extensive":[46],"code":[47,57,125],"rewrites,":[48],"while":[50,161],"they":[51],"provide":[52],"tools":[53,66],"transform":[55],"into":[58],"their":[59],"APIs":[60,160],"reduce":[62],"developer":[63],"effort,":[64],"these":[65,84],"are":[67],"frequently":[68],"incomplete":[69],"fail":[71],"ensure":[73],"seamless":[74,114],"compatibility.Furthermore,":[75],"the":[76,87,98,151,203,206],"absence":[77],"a":[79,110,127,179,221],"formal":[80],"specification":[82],"worsens":[83],"issues,":[85],"creating":[86],"\"CUDA":[88],"dialect":[89,153],"problem\",":[90],"where":[91],"nvcc":[93,138],"compiler":[94,129,132,144],"behavior":[95],"serves":[96],"de":[99],"facto":[100],"standard,":[101],"leading":[102],"incompatibilities":[104],"with":[105,133],"alternative":[106],"compilers.We":[107],"present":[108],"SCALE,":[109],"that":[112],"enables":[113],"execution":[115],"applications":[118],"on":[119],"AMD":[120,167,194,228],"without":[122],"requiring":[123],"any":[124],"modifications.Leveraging":[126],"\"dual\"":[128],"approach-a":[130],"clang-based":[131],"language":[134],"extensions":[135],"an":[137],"mode":[139],"enabled":[140],"SCALE's":[142],"clang":[143],"for":[145,224],"existing":[146],"codebases-SCALE":[148],"effectively":[149],"resolves":[150],"problem.SCALE":[154],"re-implements":[155],"CUDA's":[156],"runtime":[157],"driver":[159],"mapping":[162],"compute":[164],"capabilities":[165],"architectures,":[168],"ensuring":[169],"native-level":[170],"performance.Our":[171],"evaluation":[172],"demonstrates":[173],"SCALE":[174,212],"'s":[175],"capability":[176],"support":[178],"wide":[180],"range":[181],"real-world":[183],"frameworks":[184,200],"implemented":[185],"entirely":[186],"CUDA.SCALE":[188],"provides":[189],"broad":[190],"compatibility":[191],"across":[192],"four":[193],"microarchitectures":[196],"thirteen":[198],"popular":[199],"by":[201],"supporting":[202],"majority":[204],"API,":[208],"reflecting":[209],"60%":[210],"coverage.Finally,":[211],"delivers":[213],"minimal":[214],"performance":[215],"overhead":[216],"compared":[217],"ROCm,":[219],"enabling":[220],"unified":[222],"codebase":[223],"both":[225],"greatly":[231],"simplifying":[232],"cross-platform":[233],"development.":[234]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
