{"id":"https://openalex.org/W6922225899","doi":"https://doi.org/10.11588/heidok.00033824","title":"Automated Partitioning of CUDA Kernels for Multi-GPU Systems","display_name":"Automated Partitioning of CUDA Kernels for Multi-GPU Systems","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W6922225899","doi":"https://doi.org/10.11588/heidok.00033824"},"language":"en","primary_location":{"id":"pmh:oai:archiv.ub.uni-heidelberg.de:33824","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306402333","display_name":"heiDOK (Heidelberg University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I223822909","host_organization_name":"Heidelberg University","host_organization_lineage":["https://openalex.org/I223822909"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"","raw_type":"Dissertation"},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Braun, Lorenz","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Braun, Lorenz","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.42461381,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.7921000123023987,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.7921000123023987,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.05299999937415123,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.022700000554323196,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.6664999723434448},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6319000124931335},{"id":"https://openalex.org/keywords/profiling","display_name":"Profiling (computer programming)","score":0.59170001745224},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5824999809265137},{"id":"https://openalex.org/keywords/workstation","display_name":"Workstation","score":0.5458999872207642},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4068000018596649},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.40619999170303345},{"id":"https://openalex.org/keywords/graph-partition","display_name":"Graph partition","score":0.40230000019073486},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.3677000105381012}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8694999814033508},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6840999722480774},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.6664999723434448},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6319000124931335},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.59170001745224},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5824999809265137},{"id":"https://openalex.org/C67953723","wikidata":"https://www.wikidata.org/wiki/Q192525","display_name":"Workstation","level":2,"score":0.5458999872207642},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4068000018596649},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.40619999170303345},{"id":"https://openalex.org/C48903430","wikidata":"https://www.wikidata.org/wiki/Q491370","display_name":"Graph partition","level":3,"score":0.40230000019073486},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.3677000105381012},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.35749998688697815},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3538999855518341},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3384999930858612},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.335999995470047},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.322299987077713},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.32019999623298645},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3199999928474426},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3034999966621399},{"id":"https://openalex.org/C190902152","wikidata":"https://www.wikidata.org/wiki/Q1325106","display_name":"Optimizing compiler","level":3,"score":0.29260000586509705},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.2757999897003174},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.27000001072883606},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.2685000002384186},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C2781039887","wikidata":"https://www.wikidata.org/wiki/Q1391724","display_name":"Factor (programming language)","level":2,"score":0.2567000091075897}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:archiv.ub.uni-heidelberg.de:33824","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306402333","display_name":"heiDOK (Heidelberg University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I223822909","host_organization_name":"Heidelberg University","host_organization_lineage":["https://openalex.org/I223822909"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"","raw_type":"Dissertation"},{"id":"doi:10.11588/heidok.00033824","is_oa":true,"landing_page_url":"https://doi.org/10.11588/heidok.00033824","pdf_url":null,"source":{"id":"https://openalex.org/S7407051545","display_name":"University Library Heidelberg","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"pmh:oai:archiv.ub.uni-heidelberg.de:33824","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306402333","display_name":"heiDOK (Heidelberg University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I223822909","host_organization_name":"Heidelberg University","host_organization_lineage":["https://openalex.org/I223822909"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"","raw_type":"Dissertation"},"sustainable_development_goals":[{"display_name":"Affordable and clean energy","score":0.913058876991272,"id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Supercomputers":[0],"and":[1,25,51,56,114,168,220,302,316,322,336,353],"powerful":[2],"workstations":[3],"with":[4,111,142],"multiple":[5],"GPUs":[6,14],"have":[7],"become":[8],"the":[9,12,64,78,92,101,132,147,157,165,180,188,225,251,257,272,275,294,329,337,341],"state":[10],"of":[11,34,62,66,80,83,95,100,109,190,197,246,256,274,328],"art.":[13],"are":[15,261,277,304,311,317],"favored":[16],"for":[17,28,126,172,267,319,351,358],"their":[18],"immense":[19],"computational":[20],"power,":[21],"high":[22],"memory":[23],"bandwidth":[24],"energy":[26],"efficiency":[27],"highly":[29],"parallel":[30],"workloads.":[31],"The":[32,60,86,98,237,309,326],"translation":[33],"mathematical":[35],"problems":[36],"to":[37,200,233,248,263,282,349,356],"multi-GPU":[38],"compute":[39,93,181],"kernels":[40,67,199,292],"has":[41,68],"already":[42],"been":[43],"solved":[44],"in":[45,73],"research":[46],"by":[47,90,243],"using":[48,334],"domain-specific":[49],"languages":[50],"libraries":[52],"or":[53],"clever":[54],"analysis":[55],"transformation":[57],"during":[58],"compilation.":[59],"process":[61],"optimizing":[63],"partitioning":[65,82,159],"received":[69],"very":[70],"little":[71],"attention":[72],"research.":[74],"This":[75,222],"work":[76,185],"explores":[77],"viability":[79],"automated":[81,158],"GPU":[84],"kernels.":[85],"problem":[87,171,189],"is":[88,103,160,218,224,229,287,333],"approached":[89],"modeling":[91],"graph":[94],"selected":[96],"applications.":[97],"Execution":[99],"applications":[102,174],"simulated":[104],"on":[105,119,176,203,215,306,313],"a":[106,122,244,265,288],"wide":[107],"range":[108],"systems":[110],"different":[112],"interconnects":[113],"GPUs.":[115,308],"To":[116],"cut":[117],"down":[118],"simulation":[120,133],"time,":[121],"simulator":[123],"was":[124],"developed":[125],"this":[127,170,184],"specific":[128],"use":[129,195],"case.":[130],"With":[131],"results,":[134],"simple":[135],"but":[136],"per-case":[137],"individual":[138],"models":[139,310],"were":[140],"created,":[141],"which":[143,286],"we":[144],"show":[145,155,339],"that":[146,156,227,340],"application":[148],"behavior":[149],"can":[150],"be":[151,241,283],"well":[152],"predicted.":[153],"Results":[154],"only":[161,280],"10.17%":[162],"slower":[163],"than":[164],"optimal":[166],"partitioning.&#13;\\n&#13;\\nAnalyzing":[167],"improving":[169],"real":[173],"depends":[175],"good":[177],"information":[178,202],"about":[179],"kernel.":[182],"Thus,":[183],"additionally":[186],"considers":[187],"obtaining":[191],"such":[192,297],"information.":[193],"We":[194],"profiling":[196,228,259],"CUDA":[198],"obtain":[201],"instruction":[204,211],"counts.":[205],"A":[206],"LLVM-based":[207],"compiler":[208],"extension":[209],"providing":[210],"counts":[212],"per":[213],"kernel":[214,268],"PTX":[216],"level":[217],"proposed":[219],"evaluated.":[221],"approach":[223,260],"advantage":[226],"much":[230],"faster":[231],"compared":[232],"NVIDIAs":[234],"profiler":[235,276],"nvprof.":[236],"average":[238,343],"overhead":[239],"could":[240],"improved":[242],"factor":[245],"10":[247],"13.2":[249],"times":[250],"normal":[252],"execution":[253,320],"time.&#13;\\n&#13;\\nThe":[254],"metrics":[255,273],"new":[258],"used":[262],"develop":[264],"methodology":[266],"performance":[269,332],"prediction.":[270,325,360],"Because":[271],"GPU-independent,":[278],"they":[279],"need":[281],"measured":[284],"once,":[285],"great":[289],"advantage.":[290],"168":[291],"from":[293,347,354],"benchmark":[295],"suites":[296],"as":[298],"Parboil,":[299],"Rodinia,":[300],"Polybench-GPU":[301],"SHOC":[303],"evaluated":[305],"five":[307],"based":[312],"random":[314],"forests":[315],"built":[318],"time":[321,352],"power":[323,359],"consumption":[324],"evaluation":[327],"model":[330],"prediction":[331],"cross-validation":[335],"results":[338],"median":[342],"percentage":[344],"error":[345],"ranges":[346],"8.86":[348],"52%":[350],"1.84":[355],"2.94%":[357]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
