{"id":"https://openalex.org/W3195186037","doi":"https://doi.org/10.1109/tpds.2021.3094169","title":"An Automated Tool for Analysis and Tuning of GPU-Accelerated Code in HPC Applications","display_name":"An Automated Tool for Analysis and Tuning of GPU-Accelerated Code in HPC Applications","publication_year":2021,"publication_date":"2021-07-01","ids":{"openalex":"https://openalex.org/W3195186037","doi":"https://doi.org/10.1109/tpds.2021.3094169","mag":"3195186037"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2021.3094169","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2021.3094169","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063326523","display_name":"Keren Zhou","orcid":"https://orcid.org/0000-0002-7977-3182"},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Keren Zhou","raw_affiliation_strings":["Computer Science Department, Rice University, Houston, TX, USA"],"affiliations":[{"raw_affiliation_string":"Computer Science Department, Rice University, Houston, TX, USA","institution_ids":["https://openalex.org/I74775410"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027198310","display_name":"Xiaozhu Meng","orcid":"https://orcid.org/0000-0003-3716-9072"},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaozhu Meng","raw_affiliation_strings":["Computer Science Department, Rice University, Houston, TX, USA"],"affiliations":[{"raw_affiliation_string":"Computer Science Department, Rice University, Houston, TX, USA","institution_ids":["https://openalex.org/I74775410"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047472245","display_name":"Ryuichi Sai","orcid":"https://orcid.org/0000-0001-8372-401X"},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ryuichi Sai","raw_affiliation_strings":["Computer Science Department, Rice University, Houston, TX, USA"],"affiliations":[{"raw_affiliation_string":"Computer Science Department, Rice University, Houston, TX, USA","institution_ids":["https://openalex.org/I74775410"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006705737","display_name":"Dejan Grubisic","orcid":"https://orcid.org/0000-0003-3336-0726"},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dejan Grubisic","raw_affiliation_strings":["Computer Science Department, Rice University, Houston, TX, USA"],"affiliations":[{"raw_affiliation_string":"Computer Science Department, Rice University, Houston, TX, USA","institution_ids":["https://openalex.org/I74775410"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089709469","display_name":"John Mellor\u2010Crummey","orcid":"https://orcid.org/0000-0002-9026-5453"},"institutions":[{"id":"https://openalex.org/I74775410","display_name":"Rice University","ror":"https://ror.org/008zs3103","country_code":"US","type":"education","lineage":["https://openalex.org/I74775410"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"John Mellor-Crummey","raw_affiliation_strings":["Computer Science Department, Rice University, Houston, TX, USA"],"affiliations":[{"raw_affiliation_string":"Computer Science Department, Rice University, Houston, TX, USA","institution_ids":["https://openalex.org/I74775410"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5063326523"],"corresponding_institution_ids":["https://openalex.org/I74775410"],"apc_list":null,"apc_paid":null,"fwci":1.6424,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.83212579,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":"33","issue":"4","first_page":"854","last_page":"865"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8513329029083252},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6771113276481628},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5555814504623413},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5527125597000122},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.490790456533432},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.42672866582870483},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4172682464122772},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.38750267028808594},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2594320774078369},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.2319205403327942},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.17397090792655945}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8513329029083252},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6771113276481628},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5555814504623413},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5527125597000122},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.490790456533432},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.42672866582870483},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4172682464122772},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.38750267028808594},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2594320774078369},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.2319205403327942},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.17397090792655945},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2021.3094169","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2021.3094169","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy","score":0.5899999737739563}],"awards":[{"id":"https://openalex.org/G7768399475","display_name":null,"funder_award_id":"B639429","funder_id":"https://openalex.org/F4320338286","funder_display_name":"Lawrence Livermore National Laboratory"}],"funders":[{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320332369","display_name":"National Nuclear Security Administration","ror":"https://ror.org/03sk1we31"},{"id":"https://openalex.org/F4320338286","display_name":"Lawrence Livermore National Laboratory","ror":"https://ror.org/041nk4h53"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W960901134","https://openalex.org/W1507654557","https://openalex.org/W1517652255","https://openalex.org/W1575775940","https://openalex.org/W1902930330","https://openalex.org/W2043218878","https://openalex.org/W2067673417","https://openalex.org/W2073727874","https://openalex.org/W2080592089","https://openalex.org/W2098290747","https://openalex.org/W2101778912","https://openalex.org/W2136434791","https://openalex.org/W2140384194","https://openalex.org/W2170396259","https://openalex.org/W2270027636","https://openalex.org/W2767346422","https://openalex.org/W2789572737","https://openalex.org/W2796649226","https://openalex.org/W2902041647","https://openalex.org/W2944853139","https://openalex.org/W2979340153","https://openalex.org/W2999812057","https://openalex.org/W3007697284","https://openalex.org/W3039972296","https://openalex.org/W3040626038","https://openalex.org/W3040971560","https://openalex.org/W3128129736","https://openalex.org/W3132094421","https://openalex.org/W3136358373","https://openalex.org/W4237978577","https://openalex.org/W4238826819","https://openalex.org/W4245644062","https://openalex.org/W4249818463","https://openalex.org/W6625334730","https://openalex.org/W6634454564","https://openalex.org/W6748629087","https://openalex.org/W6750448596","https://openalex.org/W6762337438","https://openalex.org/W6780705517","https://openalex.org/W6783091751","https://openalex.org/W6790308878","https://openalex.org/W6790414588"],"related_works":["https://openalex.org/W1963859303","https://openalex.org/W2364044215","https://openalex.org/W2389600408","https://openalex.org/W240129890","https://openalex.org/W3048701459","https://openalex.org/W2149078538","https://openalex.org/W2370314112","https://openalex.org/W1912958759","https://openalex.org/W2792081825","https://openalex.org/W2893308117"],"abstract_inverted_index":{"The":[0],"US":[1],"Department":[2],"of":[3,21,27,78,106,112,147,207,227,233,248],"Energy\u2019s":[4],"fastest":[5],"supercomputers":[6],"and":[7,84,100,155,186,203,212],"forthcoming":[8],"exascale":[9],"systems":[10,40],"employ":[11],"Graphics":[12],"Processing":[13],"Units":[14],"(GPUs)":[15],"to":[16,34,92,103,133,140,159,173,201],"increase":[17],"the":[18,25,58,88,110,156,175,225],"computational":[19],"performance":[20,37,47,68,126,171,223],"compute":[22],"nodes.":[23],"However,":[24],"complexity":[26],"GPU":[28,50,107,115,157,196],"architectures":[29],"makes":[30],"tailoring":[31],"sophisticated":[32],"applications":[33,187],"achieve":[35],"high":[36],"on":[38,121,209],"GPU-accelerated":[39],"a":[41,67,76,152,205,230,245],"major":[42],"challenge.":[43],"At":[44,109],"best,":[45],"prior":[46],"tools":[48],"for":[49,194],"code":[51,73],"only":[52,119],"provide":[53],"coarse-grained":[54],"tuning":[55,195],"advice":[56,193],"at":[57,75],"kernel":[59],"level.":[60],"In":[61],"this":[62,113],"article,":[63],"we":[64],"describe":[65],"GPA,":[66],"advisor":[69],"that":[70,163,178,189,218],"suggests":[71],"potential":[72,176],"optimizations":[74,217,237],"hierarchy":[77],"levels,":[79],"including":[80],"individual":[81],"lines,":[82],"loops,":[83],"functions.":[85],"To":[86,124],"gather":[87],"fine-grained":[89],"measurements":[90],"needed":[91],"produce":[93],"such":[94],"insights,":[95],"GPA":[96,128,143,167,190,200,215,240],"uses":[97,129],"instruction":[98,116,137],"sampling":[99,117],"binary":[101],"instrumentation":[102],"monitor":[104],"execution":[105],"code.":[108,197],"time":[111],"writing,":[114],"is":[118],"available":[120],"NVIDIA":[122,210],"GPUs.":[123,214],"understand":[125],"losses,":[127],"data":[130],"flow":[131],"analysis":[132],"approximately":[134],"attribute":[135],"measured":[136],"stalls":[138,148],"back":[139],"their":[141],"causes.":[142],"then":[144,168],"analyzes":[145],"patterns":[146],"using":[149],"information":[150],"about":[151],"program\u2019s":[153],"structure":[154],"architecture":[158],"identify":[160],"optimization":[161,180],"strategies":[162],"address":[164],"inefficiencies":[165],"observed.":[166],"employs":[169],"detailed":[170],"models":[172],"estimate":[174],"speedup":[177],"each":[179],"might":[181],"provide.":[182],"Experiments":[183],"with":[184],"benchmarks":[185],"show":[188],"provides":[191],"useful":[192],"We":[198],"applied":[199],"analyze":[202],"tune":[204],"collection":[206],"codes":[208,228,243],"V100":[211],"A100":[213],"suggested":[216,238],"it":[219],"estimates":[220],"will":[221],"accelerate":[222],"across":[224],"set":[226],"by":[229,239,244],"geometric":[231,246],"mean":[232,247],"1.21\u00d7.":[234],"Applying":[235],"these":[236,242],"accelerated":[241],"1.19\u00d7.":[249]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
