{"id":"https://openalex.org/W2045920776","doi":"https://doi.org/10.1145/2712386.2712387","title":"Energy efficiency and performance frontiers for sparse computations on GPU supercomputers","display_name":"Energy efficiency and performance frontiers for sparse computations on GPU supercomputers","publication_year":2015,"publication_date":"2015-01-28","ids":{"openalex":"https://openalex.org/W2045920776","doi":"https://doi.org/10.1145/2712386.2712387","mag":"2045920776"},"language":"en","primary_location":{"id":"doi:10.1145/2712386.2712387","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2712386.2712387","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2712386.2712387?download=true","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Sixth International Workshop on Programming Models and Applications for Multicores and Manycores","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/2712386.2712387?download=true","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012133869","display_name":"Hartwig Anzt","orcid":"https://orcid.org/0000-0003-2177-952X"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Hartwig Anzt","raw_affiliation_strings":["University of Tennessee, Knoxville"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083604741","display_name":"Stanimire Tomov","orcid":"https://orcid.org/0000-0002-5937-7959"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Stanimire Tomov","raw_affiliation_strings":["University of Tennessee, Knoxville"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5075517045","display_name":"Jack Dongarra","orcid":"https://orcid.org/0000-0003-3247-1782"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jack Dongarra","raw_affiliation_strings":["University of Tennessee, Knoxville"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville","institution_ids":["https://openalex.org/I75027704"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5012133869"],"corresponding_institution_ids":["https://openalex.org/I75027704"],"apc_list":null,"apc_paid":null,"fwci":2.3082,"has_fulltext":true,"cited_by_count":12,"citation_normalized_percentile":{"value":0.87996546,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10792","display_name":"Matrix Theory and Algorithms","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9945999979972839,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8182206153869629},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7664802074432373},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.5598985552787781},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.5594383478164673},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5576977729797363},{"id":"https://openalex.org/keywords/solver","display_name":"Solver","score":0.546224057674408},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5447091460227966},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5138757228851318},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4678923189640045},{"id":"https://openalex.org/keywords/performance-improvement","display_name":"Performance improvement","score":0.44907715916633606},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.4489974081516266},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.4396817982196808},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.43966299295425415},{"id":"https://openalex.org/keywords/linear-algebra","display_name":"Linear algebra","score":0.4230438768863678},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.42203885316848755},{"id":"https://openalex.org/keywords/dot-product","display_name":"Dot product","score":0.4205642640590668},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.24332746863365173},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.08039331436157227}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8182206153869629},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7664802074432373},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.5598985552787781},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.5594383478164673},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5576977729797363},{"id":"https://openalex.org/C2778770139","wikidata":"https://www.wikidata.org/wiki/Q1966904","display_name":"Solver","level":2,"score":0.546224057674408},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5447091460227966},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5138757228851318},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4678923189640045},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.44907715916633606},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.4489974081516266},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.4396817982196808},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.43966299295425415},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.4230438768863678},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.42203885316848755},{"id":"https://openalex.org/C32900221","wikidata":"https://www.wikidata.org/wiki/Q181365","display_name":"Dot product","level":2,"score":0.4205642640590668},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.24332746863365173},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.08039331436157227},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2712386.2712387","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2712386.2712387","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2712386.2712387?download=true","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Sixth International Workshop on Programming Models and Applications for Multicores and Manycores","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/2712386.2712387","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2712386.2712387","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2712386.2712387?download=true","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Sixth International Workshop on Programming Models and Applications for Multicores and Manycores","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy","score":0.9100000262260437}],"awards":[{"id":"https://openalex.org/G1351316705","display_name":null,"funder_award_id":"DE-SC0010042","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G1687433189","display_name":null,"funder_award_id":"1339822","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G1836428231","display_name":null,"funder_award_id":"1339822","funder_id":"https://openalex.org/F4320309480","funder_display_name":"Nvidia"},{"id":"https://openalex.org/G7262430608","display_name":null,"funder_award_id":"ACI-1339822","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8753669605","display_name":null,"funder_award_id":"N14-11-00190","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8870292234","display_name":null,"funder_award_id":"Grant No. ACI-1339822","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2045920776.pdf","grobid_xml":"https://content.openalex.org/works/W2045920776.grobid-xml"},"referenced_works_count":52,"referenced_works":["https://openalex.org/W60615445","https://openalex.org/W92933111","https://openalex.org/W98110951","https://openalex.org/W158669756","https://openalex.org/W164867175","https://openalex.org/W1252105715","https://openalex.org/W1575701986","https://openalex.org/W1584326888","https://openalex.org/W1586414899","https://openalex.org/W1588915715","https://openalex.org/W1754016561","https://openalex.org/W1907157228","https://openalex.org/W1967501281","https://openalex.org/W1977146182","https://openalex.org/W1981745143","https://openalex.org/W1990660311","https://openalex.org/W2035223648","https://openalex.org/W2038122178","https://openalex.org/W2040436854","https://openalex.org/W2045616426","https://openalex.org/W2051142108","https://openalex.org/W2053107666","https://openalex.org/W2055525383","https://openalex.org/W2060812780","https://openalex.org/W2086666883","https://openalex.org/W2088213738","https://openalex.org/W2089711159","https://openalex.org/W2099021415","https://openalex.org/W2100846455","https://openalex.org/W2103069304","https://openalex.org/W2105524676","https://openalex.org/W2111221242","https://openalex.org/W2120383759","https://openalex.org/W2124007994","https://openalex.org/W2134573462","https://openalex.org/W2139116943","https://openalex.org/W2144923127","https://openalex.org/W2154976136","https://openalex.org/W2259574154","https://openalex.org/W2262570173","https://openalex.org/W2342748186","https://openalex.org/W2350991510","https://openalex.org/W2518567779","https://openalex.org/W2618281564","https://openalex.org/W2798909945","https://openalex.org/W3005347330","https://openalex.org/W3102398037","https://openalex.org/W4229666556","https://openalex.org/W4285719527","https://openalex.org/W4406095107","https://openalex.org/W6705553805","https://openalex.org/W7008672335"],"related_works":["https://openalex.org/W4288634132","https://openalex.org/W4311266784","https://openalex.org/W2981978625","https://openalex.org/W1985269159","https://openalex.org/W2293771254","https://openalex.org/W2608151934","https://openalex.org/W4381050447","https://openalex.org/W3121314575","https://openalex.org/W1835670156","https://openalex.org/W2094139070"],"abstract_inverted_index":{"In":[0,87,204],"this":[1,51,178],"paper":[2],"we":[3,20,117,156,207],"unveil":[4],"some":[5],"energy":[6,132,230,239],"efficiency":[7,240],"and":[8,35,37,61,123,128,131,172,232,241],"performance":[9,82,104,130,166,223,242],"frontiers":[10,237],"for":[11,50,68,84,119,243],"sparse":[12,26,60,141,244],"computations":[13,245],"on":[14,177,246],"GPU-based":[15],"supercomputers.":[16,247],"To":[17],"do":[18],"this,":[19],"consider":[21],"state-of-the-art":[22],"implementations":[23,213],"of":[24,59,108,150,153,235],"the":[25,41,114,120,124,144,151,159,169,173,189],"matrix-vector":[27],"(SpMV)":[28],"product":[29,98],"in":[30,40,229,238],"libraries":[31],"like":[32,143],"cuSPARSE,":[33],"MKL,":[34],"MAGMA,":[36],"their":[38],"use":[39,118],"LOBPCG":[42,44,73,125,175],"eigen-solver.":[43],"is":[45,79,180],"chosen":[46],"as":[47,53],"a":[48,75,80,96,106,139,148,164,194],"benchmark":[49],"study":[52,129],"it":[54],"combines":[55],"an":[56],"interesting":[57],"mix":[58],"dense":[62],"linear":[63],"algebra":[64],"operations":[65,92],"with":[66,188],"potential":[67],"hardware-aware":[69],"optimizations.":[70],"Most":[71],"notably,":[72],"includes":[74],"blocking":[76],"technique":[77],"that":[78,100,158,209],"common":[81],"optimization":[83,220],"many":[85],"applications.":[86],"particular,":[88],"multiple":[89],"memory-bound":[90],"SpMV":[91,145],"are":[93,214,233],"blocked":[94],"into":[95],"SpM-matrix":[97],"(SpMM),":[99],"achieves":[101,161],"significantly":[102],"higher":[103],"than":[105,185],"sequence":[107],"SpMVs.":[109],"We":[110],"provide":[111],"details":[112],"about":[113],"GPU":[115,196],"kernels":[116],"SpMV,":[121,171],"SpMM,":[122],"implementation":[126],"design,":[127],"consumption":[133],"compared":[134],"to":[135,163,182,218,226],"CPU":[136,212],"solutions.":[137],"While":[138],"typical":[140],"computation":[142],"reaches":[146],"only":[147],"fraction":[149],"peak":[152],"current":[154],"GPUs,":[155],"show":[157,208],"SpMM":[160],"up":[162],"6x":[165],"improvement":[167],"over":[168],"GPU's":[170],"GPU-accelerated":[174],"based":[176],"kernel":[179],"3":[181],"5x":[183],"faster":[184],"multicore":[186],"CPUs":[187,201],"same":[190],"power":[191],"draw,":[192],"e.g.,":[193],"K40":[195],"vs.":[197],"two":[198],"Sandy":[199],"Bridge":[200],"(16":[202],"cores).":[203],"practice":[205],"though,":[206],"currently":[210],"available":[211],"much":[215],"slower":[216],"due":[217],"missed":[219],"opportunities.":[221],"These":[222],"results":[224],"translate":[225],"similar":[227],"improvements":[228],"consumption,":[231],"indicative":[234],"today's":[236]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":1},{"year":2016,"cited_by_count":4},{"year":2015,"cited_by_count":1}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
