{"id":"https://openalex.org/W3091995851","doi":"https://doi.org/10.1177/1094342020965661","title":"Performance engineering for real and complex tall &amp; skinny matrix multiplication kernels on GPUs","display_name":"Performance engineering for real and complex tall &amp; skinny matrix multiplication kernels on GPUs","publication_year":2020,"publication_date":"2020-10-09","ids":{"openalex":"https://openalex.org/W3091995851","doi":"https://doi.org/10.1177/1094342020965661","mag":"3091995851"},"language":"en","primary_location":{"id":"doi:10.1177/1094342020965661","is_oa":true,"landing_page_url":"https://doi.org/10.1177/1094342020965661","pdf_url":"https://journals.sagepub.com/doi/pdf/10.1177/1094342020965661","source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://journals.sagepub.com/doi/pdf/10.1177/1094342020965661","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5081383054","display_name":"Dominik Ernst","orcid":"https://orcid.org/0000-0003-3547-0611"},"institutions":[{"id":"https://openalex.org/I181369854","display_name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg","ror":"https://ror.org/00f7hpc57","country_code":"DE","type":"education","lineage":["https://openalex.org/I181369854"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Dominik Ernst","raw_affiliation_strings":["Erlangen Regional Computing Center (RRZE), Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg, Erlangen, Germany"],"affiliations":[{"raw_affiliation_string":"Erlangen Regional Computing Center (RRZE), Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg, Erlangen, Germany","institution_ids":["https://openalex.org/I181369854"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082552227","display_name":"Georg Hager","orcid":"https://orcid.org/0000-0002-8723-2781"},"institutions":[{"id":"https://openalex.org/I181369854","display_name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg","ror":"https://ror.org/00f7hpc57","country_code":"DE","type":"education","lineage":["https://openalex.org/I181369854"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Georg Hager","raw_affiliation_strings":["Erlangen Regional Computing Center (RRZE), Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg, Erlangen, Germany"],"affiliations":[{"raw_affiliation_string":"Erlangen Regional Computing Center (RRZE), Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg, Erlangen, Germany","institution_ids":["https://openalex.org/I181369854"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010377968","display_name":"Jonas Thies","orcid":"https://orcid.org/0000-0001-9231-9999"},"institutions":[{"id":"https://openalex.org/I2898391981","display_name":"Deutsches Zentrum f\u00fcr Luft- und Raumfahrt e. V. (DLR)","ror":"https://ror.org/04bwf3e34","country_code":"DE","type":"facility","lineage":["https://openalex.org/I1305996414","https://openalex.org/I2898391981"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Jonas Thies","raw_affiliation_strings":["German Aerospace Center (DLR), Simulation and Software Technology, K\u00f6ln, Germany"],"affiliations":[{"raw_affiliation_string":"German Aerospace Center (DLR), Simulation and Software Technology, K\u00f6ln, Germany","institution_ids":["https://openalex.org/I2898391981"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070209050","display_name":"Gerhard Wellein","orcid":"https://orcid.org/0000-0001-7371-3026"},"institutions":[{"id":"https://openalex.org/I181369854","display_name":"Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg","ror":"https://ror.org/00f7hpc57","country_code":"DE","type":"education","lineage":["https://openalex.org/I181369854"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Gerhard Wellein","raw_affiliation_strings":["Erlangen Regional Computing Center (RRZE), Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg, Erlangen, Germany"],"affiliations":[{"raw_affiliation_string":"Erlangen Regional Computing Center (RRZE), Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg, Erlangen, Germany","institution_ids":["https://openalex.org/I181369854"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5081383054"],"corresponding_institution_ids":["https://openalex.org/I181369854"],"apc_list":null,"apc_paid":null,"fwci":1.1713,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.78045667,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":"35","issue":"1","first_page":"5","last_page":"19"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7629712224006653},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6643946170806885},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.5586933493614197},{"id":"https://openalex.org/keywords/thread","display_name":"Thread (computing)","score":0.5230701565742493},{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.5230523347854614},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.49503082036972046},{"id":"https://openalex.org/keywords/code-generation","display_name":"Code generation","score":0.492664635181427},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.4887266159057617},{"id":"https://openalex.org/keywords/square-matrix","display_name":"Square matrix","score":0.4424348771572113},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.4370870590209961},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3030306100845337},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.1959415078163147},{"id":"https://openalex.org/keywords/symmetric-matrix","display_name":"Symmetric matrix","score":0.15282869338989258},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.12855175137519836},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.11024096608161926},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.07663559913635254}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7629712224006653},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6643946170806885},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.5586933493614197},{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.5230701565742493},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.5230523347854614},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.49503082036972046},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.492664635181427},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.4887266159057617},{"id":"https://openalex.org/C69044650","wikidata":"https://www.wikidata.org/wiki/Q2739329","display_name":"Square matrix","level":4,"score":0.4424348771572113},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.4370870590209961},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3030306100845337},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.1959415078163147},{"id":"https://openalex.org/C54848796","wikidata":"https://www.wikidata.org/wiki/Q339011","display_name":"Symmetric matrix","level":3,"score":0.15282869338989258},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.12855175137519836},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.11024096608161926},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.07663559913635254},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C158693339","wikidata":"https://www.wikidata.org/wiki/Q190524","display_name":"Eigenvalues and eigenvectors","level":2,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1177/1094342020965661","is_oa":true,"landing_page_url":"https://doi.org/10.1177/1094342020965661","pdf_url":"https://journals.sagepub.com/doi/pdf/10.1177/1094342020965661","source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"},{"id":"pmh:oai:elib.dlr.de:137944","is_oa":false,"landing_page_url":"https://doi.org/10.1177/1094342020965661>.","pdf_url":null,"source":{"id":"https://openalex.org/S4377196266","display_name":"elib (German Aerospace Center)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I2898391981","host_organization_name":"Deutsches Zentrum f\u00fcr Luft- und Raumfahrt e. V. (DLR)","host_organization_lineage":["https://openalex.org/I2898391981"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":null,"raw_type":"PeerReviewed"},{"id":"pmh:oai:ub.uni-erlangen.de-opus:15447","is_oa":true,"landing_page_url":"https://nbn-resolving.org/urn:nbn:de:bvb:29-opus4-154471","pdf_url":null,"source":{"id":"https://openalex.org/S4306401636","display_name":"OPUS Repository (Kooperativer Bibliotheksverbund Berlin-Brandenburg)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1177/1094342020965661","is_oa":true,"landing_page_url":"https://doi.org/10.1177/1094342020965661","pdf_url":"https://journals.sagepub.com/doi/pdf/10.1177/1094342020965661","source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.44999998807907104}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3091995851.pdf","grobid_xml":"https://content.openalex.org/works/W3091995851.grobid-xml"},"referenced_works_count":17,"referenced_works":["https://openalex.org/W44020937","https://openalex.org/W2002555321","https://openalex.org/W2008698464","https://openalex.org/W2035476608","https://openalex.org/W2040425799","https://openalex.org/W2175670603","https://openalex.org/W2546792675","https://openalex.org/W2792804042","https://openalex.org/W2924040443","https://openalex.org/W2944647618","https://openalex.org/W2950034300","https://openalex.org/W3014127403","https://openalex.org/W3048668821","https://openalex.org/W3098878359","https://openalex.org/W3101665981","https://openalex.org/W4232940466","https://openalex.org/W4248316278"],"related_works":["https://openalex.org/W1963859303","https://openalex.org/W2364044215","https://openalex.org/W2389600408","https://openalex.org/W240129890","https://openalex.org/W3048701459","https://openalex.org/W2149078538","https://openalex.org/W2370314112","https://openalex.org/W1912958759","https://openalex.org/W2792081825","https://openalex.org/W2893308117"],"abstract_inverted_index":{"General":[0],"matrix-matrix":[1],"multiplications":[2],"with":[3,107],"double-precision":[4],"real":[5],"and":[6,10,63,83,86,96,130],"complex":[7],"entries":[8],"(DGEMM":[9],"ZGEMM)":[11],"in":[12,56,116],"vendor-supplied":[13],"BLAS":[14],"libraries":[15],"are":[16,33],"best":[17],"optimized":[18],"for":[19,27,98],"square":[20],"matrices":[21],"but":[22],"often":[23,131],"show":[24],"bad":[25],"performance":[26,49,129],"tall":[28],"&amp;":[29],"skinny":[30],"matrices,":[31],"which":[32],"much":[34],"taller":[35],"than":[36],"wide.":[37],"NVIDIA\u2019s":[38],"current":[39],"CUBLAS":[40,136],"implementation":[41,68],"delivers":[42],"only":[43],"a":[44,88,110],"fraction":[45],"of":[46,66,81,113,119,126],"the":[47,53,61,117,127],"potential":[48],"as":[50],"indicated":[51],"by":[52],"roofline":[54,128],"model":[55],"this":[57],"case.":[58],"We":[59,76],"describe":[60],"challenges":[62],"key":[64],"characteristics":[65],"an":[67,139],"that":[69],"can":[70],"achieve":[71,122],"close":[72],"to":[73],"optimal":[74],"performance.":[75],"further":[77],"evaluate":[78],"different":[79],"strategies":[80],"parallelization":[82],"thread":[84],"distribution":[85],"devise":[87],"flexible,":[89],"configurable":[90],"mapping":[91],"scheme.":[92],"To":[93],"ensure":[94],"flexibility":[95],"allow":[97],"highly":[99],"tailored":[100],"implementations":[101],"we":[102,121],"use":[103],"code":[104],"generation":[105],"combined":[106],"autotuning.":[108],"For":[109],"large":[111],"range":[112],"matrix":[114],"sizes":[115],"domain":[118],"interest":[120],"at":[123],"least":[124],"2/3":[125],"substantially":[132],"outperform":[133],"state-of-the":[134],"art":[135],"results":[137],"on":[138],"NVIDIA":[140],"Volta":[141],"GPGPU.":[142]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":2},{"year":2021,"cited_by_count":3}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
