{"id":"https://openalex.org/W7155380284","doi":"https://doi.org/10.1145/3777884.3797012","title":"Performance Analysis and Optimization of 3D Generative Diffusion Models across GPU Architectures","display_name":"Performance Analysis and Optimization of 3D Generative Diffusion Models across GPU Architectures","publication_year":2026,"publication_date":"2026-04-23","ids":{"openalex":"https://openalex.org/W7155380284","doi":"https://doi.org/10.1145/3777884.3797012"},"language":null,"primary_location":{"id":"doi:10.1145/3777884.3797012","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3777884.3797012","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 17th ACM/SPEC International Conference on Performance Engineering","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3777884.3797012","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111062876","display_name":"Jeeho Ryoo","orcid":null},"institutions":[{"id":"https://openalex.org/I4210140396","display_name":"Becton Dickinson (Canada)","ror":"https://ror.org/031dsww89","country_code":"CA","type":"company","lineage":["https://openalex.org/I146461966","https://openalex.org/I4210140396"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Jeeho Ryoo","raw_affiliation_strings":["Fairleigh Dickinson University, Vancouver, BC, Canada"],"raw_orcid":"https://orcid.org/0009-0003-0401-3685","affiliations":[{"raw_affiliation_string":"Fairleigh Dickinson University, Vancouver, BC, Canada","institution_ids":["https://openalex.org/I4210140396"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134415900","display_name":"Yongchan Jung","orcid":"https://orcid.org/0009-0002-6504-6723"},"institutions":[{"id":"https://openalex.org/I4210140396","display_name":"Becton Dickinson (Canada)","ror":"https://ror.org/031dsww89","country_code":"CA","type":"company","lineage":["https://openalex.org/I146461966","https://openalex.org/I4210140396"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Yongchan Jung","raw_affiliation_strings":["Fairleigh Dickinson University, Vancouver, BC, Canada"],"raw_orcid":"https://orcid.org/0009-0002-6504-6723","affiliations":[{"raw_affiliation_string":"Fairleigh Dickinson University, Vancouver, BC, Canada","institution_ids":["https://openalex.org/I4210140396"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057476679","display_name":"Muhammad Ali Khaliq","orcid":null},"institutions":[{"id":"https://openalex.org/I888729015","display_name":"University of Colorado Colorado Springs","ror":"https://ror.org/054spjc55","country_code":"US","type":"education","lineage":["https://openalex.org/I888729015"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Muhammad Ali Khaliq","raw_affiliation_strings":["University of Colorado at Colorado Springs, Colorado Springs, CO, USA"],"raw_orcid":"https://orcid.org/0009-0006-2256-7974","affiliations":[{"raw_affiliation_string":"University of Colorado at Colorado Springs, Colorado Springs, CO, USA","institution_ids":["https://openalex.org/I888729015"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134408214","display_name":"Weidong Zhang","orcid":"https://orcid.org/0009-0009-2158-1784"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Weidong Zhang","raw_affiliation_strings":["Northeastern University, Vancouver, BC, Canada"],"raw_orcid":"https://orcid.org/0009-0009-2158-1784","affiliations":[{"raw_affiliation_string":"Northeastern University, Vancouver, BC, Canada","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134392939","display_name":"Jiatong Han","orcid":"https://orcid.org/0009-0001-2843-1780"},"institutions":[{"id":"https://openalex.org/I4210140396","display_name":"Becton Dickinson (Canada)","ror":"https://ror.org/031dsww89","country_code":"CA","type":"company","lineage":["https://openalex.org/I146461966","https://openalex.org/I4210140396"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Jiatong Han","raw_affiliation_strings":["Fairleigh Dickinson University, Vancouver, BC, Canada"],"raw_orcid":"https://orcid.org/0009-0001-2843-1780","affiliations":[{"raw_affiliation_string":"Fairleigh Dickinson University, Vancouver, BC, Canada","institution_ids":["https://openalex.org/I4210140396"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5027851216","display_name":"Byeong Kil Lee","orcid":"https://orcid.org/0000-0002-0260-2238"},"institutions":[{"id":"https://openalex.org/I888729015","display_name":"University of Colorado Colorado Springs","ror":"https://ror.org/054spjc55","country_code":"US","type":"education","lineage":["https://openalex.org/I888729015"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Byeong Kil Lee","raw_affiliation_strings":["University of Colorado at Colorado Springs, Colorado Springs, CO, USA"],"raw_orcid":"https://orcid.org/0000-0002-0260-2238","affiliations":[{"raw_affiliation_string":"University of Colorado at Colorado Springs, Colorado Springs, CO, USA","institution_ids":["https://openalex.org/I888729015"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5111062876"],"corresponding_institution_ids":["https://openalex.org/I4210140396"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.94671825,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"218","last_page":"229"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11304","display_name":"Advanced Neuroimaging Techniques and Applications","score":0.3447999954223633,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11304","display_name":"Advanced Neuroimaging Techniques and Applications","score":0.3447999954223633,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.1679999977350235,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11471","display_name":"Block Copolymer Self-Assembly","score":0.04439999908208847,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.6901999711990356},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5564000010490417},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.4991999864578247},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.4959999918937683},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4837000072002411},{"id":"https://openalex.org/keywords/diffusion-mri","display_name":"Diffusion MRI","score":0.41040000319480896},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.40700000524520874},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.4000999927520752}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8059999942779541},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.6901999711990356},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5564000010490417},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.4991999864578247},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.4959999918937683},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4837000072002411},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.48330000042915344},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.47690001130104065},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.41190001368522644},{"id":"https://openalex.org/C149550507","wikidata":"https://www.wikidata.org/wiki/Q899360","display_name":"Diffusion MRI","level":3,"score":0.41040000319480896},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.40700000524520874},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.4000999927520752},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.3750999867916107},{"id":"https://openalex.org/C9395851","wikidata":"https://www.wikidata.org/wiki/Q177929","display_name":"Stack (abstract data type)","level":2,"score":0.34599998593330383},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3359000086784363},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.32749998569488525},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3246999979019165},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.31679999828338623},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.30379998683929443},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.290800005197525},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.28850001096725464},{"id":"https://openalex.org/C2780365336","wikidata":"https://www.wikidata.org/wiki/Q25047934","display_name":"Single-core","level":2,"score":0.2750999927520752}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3777884.3797012","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3777884.3797012","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 17th ACM/SPEC International Conference on Performance Engineering","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3777884.3797012","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3777884.3797012","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 17th ACM/SPEC International Conference on Performance Engineering","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":46,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W2464708700","https://openalex.org/W2890139949","https://openalex.org/W2983865192","https://openalex.org/W3096165964","https://openalex.org/W3129927603","https://openalex.org/W3203312420","https://openalex.org/W4213300949","https://openalex.org/W4284899685","https://openalex.org/W4296823261","https://openalex.org/W4306168172","https://openalex.org/W4308555638","https://openalex.org/W4310259937","https://openalex.org/W4385272245","https://openalex.org/W4386352890","https://openalex.org/W4386702657","https://openalex.org/W4388200034","https://openalex.org/W4388459526","https://openalex.org/W4388551943","https://openalex.org/W4390970420","https://openalex.org/W4393157170","https://openalex.org/W4393972877","https://openalex.org/W4396213301","https://openalex.org/W4396686649","https://openalex.org/W4396909894","https://openalex.org/W4398222787","https://openalex.org/W4400981456","https://openalex.org/W4401211860","https://openalex.org/W4401750764","https://openalex.org/W4401984079","https://openalex.org/W4404459777","https://openalex.org/W4404849657","https://openalex.org/W4405268957","https://openalex.org/W4406749779","https://openalex.org/W4408345761","https://openalex.org/W4409721080","https://openalex.org/W4409883006","https://openalex.org/W4409917015","https://openalex.org/W4410161985","https://openalex.org/W4410289560","https://openalex.org/W4410308676","https://openalex.org/W4413630510","https://openalex.org/W4414196737","https://openalex.org/W4414398608","https://openalex.org/W4415988375","https://openalex.org/W7103752107"],"related_works":[],"abstract_inverted_index":{"Diffusion":[0],"models":[1],"have":[2],"become":[3],"essential":[4],"for":[5],"high-fidelity":[6],"3D":[7,111],"MRI":[8],"synthesis,":[9],"yet":[10],"their":[11],"deployment":[12],"remains":[13],"constrained":[14],"by":[15,77,97,121,128,141],"substantial":[16],"GPU":[17],"resource":[18],"demands":[19],"arising":[20,85],"from":[21,86,134],"hundreds":[22],"of":[23,41,51],"U-Net":[24],"evaluations":[25],"per":[26],"sample":[27],"and":[28,66,80,91,109,114,138],"a":[29,37,110],"highly":[30],"heterogeneous":[31],"kernel":[32],"behavior.":[33],"This":[34],"paper":[35],"performs":[36],"comprehensive":[38],"performance":[39],"analysis":[40],"the":[42],"state-of-the-art":[43],"medical":[44],"diffusion":[45],"model,":[46],"Med-DDPM,":[47],"across":[48],"three":[49],"generations":[50],"NVIDIA":[52],"architectures":[53],"to":[54,123,136],"study":[55],"kernel-level":[56],"runtime":[57],"breakdowns,":[58],"instruction-mix":[59],"characteristics,":[60],"memory":[61],"system":[62],"utilization,":[63],"warp-level":[64],"activities,":[65],"profiler":[67],"priority-score":[68],"estimates.":[69],"We":[70],"show":[71],"that":[72,116],"training":[73],"is":[74],"overwhelmingly":[75],"dominated":[76],"cuDNN":[78],"convolution":[79],"implicit-GEMM":[81],"kernels,":[82],"with":[83],"inefficiencies":[84],"memory-access":[87],"patterns,":[88],"tensor-layout":[89],"conversions,":[90],"limited":[92],"Tensor":[93,106,131],"Core":[94,107,132],"utilization.":[95],"Guided":[96],"these":[98],"insights,":[99],"we":[100],"evaluate":[101],"two":[102],"architecture-aware":[103],"optimizations":[104],"TF32":[105],"activation":[108],"channels-last":[112],"layout":[113],"demonstrate":[115],"they":[117],"reduce":[118],"SM":[119],"cycles":[120],"up":[122],"100x,":[124,129],"cut":[125],"dynamic":[126],"instructions":[127],"raise":[130],"utilization":[133],"1.45":[135],"9.98x,":[137],"increase":[139],"IPC":[140],"7%":[142],"on":[143],"A100,":[144],"all":[145],"without":[146],"degrading":[147],"synthesis":[148],"quality.":[149]},"counts_by_year":[],"updated_date":"2026-04-24T06:07:52.864757","created_date":"2026-04-24T00:00:00"}
