{"id":"https://openalex.org/W2775348209","doi":"https://doi.org/10.1109/tpds.2017.2783929","title":"A Guide for Achieving High Performance with Very Small Matrices on GPU: A Case Study of Batched LU and Cholesky Factorizations","display_name":"A Guide for Achieving High Performance with Very Small Matrices on GPU: A Case Study of Batched LU and Cholesky Factorizations","publication_year":2017,"publication_date":"2017-12-15","ids":{"openalex":"https://openalex.org/W2775348209","doi":"https://doi.org/10.1109/tpds.2017.2783929","mag":"2775348209"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2017.2783929","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2017.2783929","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://research.manchester.ac.uk/en/publications/1809858e-bda5-4a8e-a59c-ace79b847c53","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101964224","display_name":"Azzam Haidar","orcid":"https://orcid.org/0000-0002-3177-2084"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Azzam Haidar","raw_affiliation_strings":["Innovative Computing Laboratory, University of Tennessee, Knoxville, TN"],"affiliations":[{"raw_affiliation_string":"Innovative Computing Laboratory, University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077268543","display_name":"Ahmad Abdelfattah","orcid":"https://orcid.org/0000-0001-5054-4784"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ahmad Abdelfattah","raw_affiliation_strings":["Innovative Computing Laboratory, University of Tennessee, Knoxville, TN"],"affiliations":[{"raw_affiliation_string":"Innovative Computing Laboratory, University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079445612","display_name":"Mawussi Zounon","orcid":"https://orcid.org/0000-0002-6955-1500"},"institutions":[{"id":"https://openalex.org/I28407311","display_name":"University of Manchester","ror":"https://ror.org/027m9bs27","country_code":"GB","type":"education","lineage":["https://openalex.org/I28407311"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mawussi Zounon","raw_affiliation_strings":["University of Manchester, Manchester, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Manchester, Manchester, United Kingdom","institution_ids":["https://openalex.org/I28407311"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083604741","display_name":"Stanimire Tomov","orcid":"https://orcid.org/0000-0002-5937-7959"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Stanimire Tomov","raw_affiliation_strings":["Innovative Computing Laboratory, University of Tennessee, Knoxville, TN"],"affiliations":[{"raw_affiliation_string":"Innovative Computing Laboratory, University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5075517045","display_name":"Jack Dongarra","orcid":"https://orcid.org/0000-0003-3247-1782"},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]},{"id":"https://openalex.org/I28407311","display_name":"University of Manchester","ror":"https://ror.org/027m9bs27","country_code":"GB","type":"education","lineage":["https://openalex.org/I28407311"]},{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Jack Dongarra","raw_affiliation_strings":["Innovative Computing Laboratory, University of Tennessee, Knoxville, TN","Oak Ridge National Laboratory, Oak Ridge, TN","University of Manchester, Manchester, United Kingdom"],"affiliations":[{"raw_affiliation_string":"Innovative Computing Laboratory, University of Tennessee, Knoxville, TN","institution_ids":["https://openalex.org/I75027704"]},{"raw_affiliation_string":"Oak Ridge National Laboratory, Oak Ridge, TN","institution_ids":["https://openalex.org/I1289243028"]},{"raw_affiliation_string":"University of Manchester, Manchester, United Kingdom","institution_ids":["https://openalex.org/I28407311"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101964224"],"corresponding_institution_ids":["https://openalex.org/I75027704"],"apc_list":null,"apc_paid":null,"fwci":2.7739,"has_fulltext":false,"cited_by_count":28,"citation_normalized_percentile":{"value":0.92341357,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":"29","issue":"5","first_page":"973","last_page":"984"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10792","display_name":"Matrix Theory and Algorithms","score":0.9965999722480774,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cholesky-decomposition","display_name":"Cholesky decomposition","score":0.941096305847168},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8656538128852844},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7519755363464355},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6853190064430237},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.547387957572937},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.5365773439407349},{"id":"https://openalex.org/keywords/lu-decomposition","display_name":"LU decomposition","score":0.4823841154575348},{"id":"https://openalex.org/keywords/linear-algebra","display_name":"Linear algebra","score":0.44189879298210144},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.43645548820495605},{"id":"https://openalex.org/keywords/pascal","display_name":"Pascal (unit)","score":0.42681753635406494},{"id":"https://openalex.org/keywords/matrix-decomposition","display_name":"Matrix decomposition","score":0.33861470222473145},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09009960293769836},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.08251309394836426}],"concepts":[{"id":"https://openalex.org/C34727166","wikidata":"https://www.wikidata.org/wiki/Q515375","display_name":"Cholesky decomposition","level":3,"score":0.941096305847168},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8656538128852844},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7519755363464355},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6853190064430237},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.547387957572937},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.5365773439407349},{"id":"https://openalex.org/C123213974","wikidata":"https://www.wikidata.org/wiki/Q833089","display_name":"LU decomposition","level":4,"score":0.4823841154575348},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.44189879298210144},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.43645548820495605},{"id":"https://openalex.org/C75608658","wikidata":"https://www.wikidata.org/wiki/Q44395","display_name":"Pascal (unit)","level":2,"score":0.42681753635406494},{"id":"https://openalex.org/C42355184","wikidata":"https://www.wikidata.org/wiki/Q1361088","display_name":"Matrix decomposition","level":3,"score":0.33861470222473145},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09009960293769836},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.08251309394836426},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C158693339","wikidata":"https://www.wikidata.org/wiki/Q190524","display_name":"Eigenvalues and eigenvectors","level":2,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tpds.2017.2783929","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2017.2783929","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},{"id":"pmh:oai:pure.atira.dk:openaire/1809858e-bda5-4a8e-a59c-ace79b847c53","is_oa":true,"landing_page_url":"https://research.manchester.ac.uk/en/publications/1809858e-bda5-4a8e-a59c-ace79b847c53","pdf_url":null,"source":{"id":"https://openalex.org/S4306400662","display_name":"Research Explorer (The University of Manchester)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28407311","host_organization_name":"University of Manchester","host_organization_lineage":["https://openalex.org/I28407311"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Haidar, A, Abdelfattah, A, Zounon, M, Tomov, S & Dongarra, J 2017, 'A Guide For Achieving High Performance With Very Small Matrices On GPU : A case Study of Batched LU and Cholesky Factorizations', IEEE Transactions on Parallel and Distributed Systems. https://doi.org/10.1109/TPDS.2017.2783929","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:pure.atira.dk:publications/1809858e-bda5-4a8e-a59c-ace79b847c53","is_oa":false,"landing_page_url":"http://www.scopus.com/inward/record.url?scp=85038871819&partnerID=8YFLogxK","pdf_url":null,"source":{"id":"https://openalex.org/S4306400662","display_name":"Research Explorer (The University of Manchester)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28407311","host_organization_name":"University of Manchester","host_organization_lineage":["https://openalex.org/I28407311"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""}],"best_oa_location":{"id":"pmh:oai:pure.atira.dk:openaire/1809858e-bda5-4a8e-a59c-ace79b847c53","is_oa":true,"landing_page_url":"https://research.manchester.ac.uk/en/publications/1809858e-bda5-4a8e-a59c-ace79b847c53","pdf_url":null,"source":{"id":"https://openalex.org/S4306400662","display_name":"Research Explorer (The University of Manchester)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I28407311","host_organization_name":"University of Manchester","host_organization_lineage":["https://openalex.org/I28407311"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Haidar, A, Abdelfattah, A, Zounon, M, Tomov, S & Dongarra, J 2017, 'A Guide For Achieving High Performance With Very Small Matrices On GPU : A case Study of Batched LU and Cholesky Factorizations', IEEE Transactions on Parallel and Distributed Systems. https://doi.org/10.1109/TPDS.2017.2783929","raw_type":"info:eu-repo/semantics/publishedVersion"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1500703584","display_name":"SI2:SSE: MAtrix, TEnsor, and Deep-Learning Optimized Routines (MATEDOR)","funder_award_id":"1740250","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G7890925933","display_name":null,"funder_award_id":"1514286","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8084533499","display_name":null,"funder_award_id":"OAC 1740250","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8272958095","display_name":null,"funder_award_id":"CSR 1514286","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G848032724","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W48467709","https://openalex.org/W1004371317","https://openalex.org/W1539904802","https://openalex.org/W1624860839","https://openalex.org/W1667652561","https://openalex.org/W1737435842","https://openalex.org/W1897240848","https://openalex.org/W1965554802","https://openalex.org/W1981812667","https://openalex.org/W1985691044","https://openalex.org/W1997033059","https://openalex.org/W2002555321","https://openalex.org/W2006682733","https://openalex.org/W2021918216","https://openalex.org/W2028425870","https://openalex.org/W2029148980","https://openalex.org/W2054658678","https://openalex.org/W2127906479","https://openalex.org/W2129170915","https://openalex.org/W2150756254","https://openalex.org/W2162322364","https://openalex.org/W2169150754","https://openalex.org/W2187691968","https://openalex.org/W2288670848","https://openalex.org/W2340769696","https://openalex.org/W2528907418","https://openalex.org/W2621991888","https://openalex.org/W2622428623","https://openalex.org/W2729594189","https://openalex.org/W2741554038","https://openalex.org/W2963392252","https://openalex.org/W2964105097","https://openalex.org/W3100638071","https://openalex.org/W4302296459","https://openalex.org/W6601929215","https://openalex.org/W6632352690","https://openalex.org/W6637151318","https://openalex.org/W6684863423","https://openalex.org/W6687000134","https://openalex.org/W6704553824","https://openalex.org/W6742595296"],"related_works":["https://openalex.org/W4224251700","https://openalex.org/W2526784484","https://openalex.org/W4288072583","https://openalex.org/W2100843445","https://openalex.org/W2025695688","https://openalex.org/W3149476094","https://openalex.org/W3208993877","https://openalex.org/W4246966070","https://openalex.org/W1894894874","https://openalex.org/W2127144731"],"abstract_inverted_index":{"We":[0,38,62],"present":[1],"a":[2,7,49,59,64,68,98],"high-performance":[3,71],"GPU":[4,32,120,178],"kernel":[5],"with":[6],"substantial":[8],"speedup":[9],"over":[10],"vendor":[11],"libraries":[12],"for":[13,34,53,70,74,116,122,151,161,189],"very":[14,75],"small":[15,35,76],"matrix":[16,36],"computations.":[17],"In":[18],"addition,":[19],"we":[20,81],"discuss":[21],"most":[22],"of":[23,30,48,106,125,127,129,185],"the":[24,28,45,55,83,91,102,107,148,190],"challenges":[25],"that":[26],"hinder":[27],"design":[29],"efficient":[31,138],"kernels":[33,121,144,153,179],"algorithms.":[37],"propose":[39],"relevant":[40],"algorithm":[41],"analysis":[42,66],"to":[43,96,101,187],"harness":[44],"full":[46],"power":[47],"GPU,":[50],"and":[51,67,85,88,112,132,141,167],"strategies":[52],"predicting":[54],"performance,":[56],"before":[57],"introducing":[58],"proper":[60],"implementation.":[61],"develop":[63],"theoretical":[65,103],"methodology":[69,93],"linear":[72],"solvers":[73],"matrices.":[77],"As":[78],"test":[79],"cases,":[80],"take":[82],"Cholesky":[84,131,140],"LU":[86,133,143],"factorizations":[87],"show":[89],"how":[90],"proposed":[92,174,177],"enables":[94],"us":[95],"achieve":[97,180],"performance":[99,181],"close":[100],"upper":[104],"bound":[105],"hardware.":[108],"This":[109],"work":[110],"investigates":[111],"proposes":[113],"novel":[114],"algorithms":[115],"designing":[117],"highly":[118],"optimized":[119],"solving":[123],"batches":[124],"hundreds":[126],"thousands":[128],"small-size":[130],"factorizations.":[134],"Our":[135],"focus":[136],"on":[137,196],"batched":[139,142],"is":[145],"motivated":[146],"by":[147],"increasing":[149],"need":[150],"these":[152],"in":[154,172],"scientific":[155],"simulations":[156],"(e.g.,":[157],"astrophysics":[158],"applications).":[159],"Techniques":[160],"optimal":[162],"memory":[163],"traffic,":[164],"register":[165],"blocking,":[166],"tunable":[168],"concurrency":[169],"are":[170],"incorporated":[171],"our":[173],"design.":[175],"The":[176],"speedups":[182],"versus":[183],"CUBLAS":[184],"up":[186],"6\u00d7":[188],"factorizations,":[191],"using":[192],"double":[193],"precision":[194],"arithmetic":[195],"an":[197],"NVIDIA":[198],"Pascal":[199],"P100":[200],"GPU.":[201]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":7},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":5},{"year":2018,"cited_by_count":4}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
