{"id":"https://openalex.org/W2099021415","doi":"https://doi.org/10.1177/1094342010385729","title":"An Improved Magma Gemm For Fermi Graphics Processing Units","display_name":"An Improved Magma Gemm For Fermi Graphics Processing Units","publication_year":2010,"publication_date":"2010-11-01","ids":{"openalex":"https://openalex.org/W2099021415","doi":"https://doi.org/10.1177/1094342010385729","mag":"2099021415"},"language":"en","primary_location":{"id":"doi:10.1177/1094342010385729","is_oa":false,"landing_page_url":"https://doi.org/10.1177/1094342010385729","pdf_url":null,"source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109721685","display_name":"Rajib Nath","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajib Nath","raw_affiliation_strings":["University of Tennassee, USA","University of Tennassee, USA#TAB#"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Tennassee, USA","institution_ids":[]},{"raw_affiliation_string":"University of Tennassee, USA#TAB#","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083604741","display_name":"Stanimire Tomov","orcid":"https://orcid.org/0000-0002-5937-7959"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Stanimire Tomov","raw_affiliation_strings":["University of Tennassee, USA,","University of Tennassee, USA#TAB#"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Tennassee, USA,","institution_ids":[]},{"raw_affiliation_string":"University of Tennassee, USA#TAB#","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5075517045","display_name":"Jack Dongarra","orcid":"https://orcid.org/0000-0003-3247-1782"},"institutions":[{"id":"https://openalex.org/I1289243028","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56","country_code":"US","type":"facility","lineage":["https://openalex.org/I1289243028","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I4210159294"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jack Dongarra","raw_affiliation_strings":["University of Tennassee, USA, Oak Ridge National Laboratory, USA, University Of Manchester, UK"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Tennassee, USA, Oak Ridge National Laboratory, USA, University Of Manchester, UK","institution_ids":["https://openalex.org/I1289243028"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5083604741"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":11.1254,"has_fulltext":false,"cited_by_count":188,"citation_normalized_percentile":{"value":0.98670139,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":"24","issue":"4","first_page":"511","last_page":"515"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7552105784416199},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7059483528137207},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.6684555411338806},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.6386495232582092},{"id":"https://openalex.org/keywords/double-precision-floating-point-format","display_name":"Double-precision floating-point format","score":0.5982844233512878},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5584768056869507},{"id":"https://openalex.org/keywords/memory-hierarchy","display_name":"Memory hierarchy","score":0.5244434475898743},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.49868321418762207},{"id":"https://openalex.org/keywords/magma","display_name":"Magma","score":0.4868744909763336},{"id":"https://openalex.org/keywords/linear-algebra","display_name":"Linear algebra","score":0.48466116189956665},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.4843217432498932},{"id":"https://openalex.org/keywords/fermi-gamma-ray-space-telescope","display_name":"Fermi Gamma-ray Space Telescope","score":0.46195507049560547},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.43111634254455566},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.42577171325683594},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.34198886156082153},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.2664651870727539},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.20405703783035278},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.16519016027450562},{"id":"https://openalex.org/keywords/geometry","display_name":"Geometry","score":0.11571714282035828},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.10632213950157166},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.07768917083740234},{"id":"https://openalex.org/keywords/combinatorics","display_name":"Combinatorics","score":0.07331234216690063}],"concepts":[{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7552105784416199},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7059483528137207},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.6684555411338806},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.6386495232582092},{"id":"https://openalex.org/C35912277","wikidata":"https://www.wikidata.org/wiki/Q1243369","display_name":"Double-precision floating-point format","level":3,"score":0.5982844233512878},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5584768056869507},{"id":"https://openalex.org/C2778100165","wikidata":"https://www.wikidata.org/wiki/Q1589327","display_name":"Memory hierarchy","level":3,"score":0.5244434475898743},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.49868321418762207},{"id":"https://openalex.org/C183222429","wikidata":"https://www.wikidata.org/wiki/Q42278","display_name":"Magma","level":3,"score":0.4868744909763336},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.48466116189956665},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.4843217432498932},{"id":"https://openalex.org/C186769553","wikidata":"https://www.wikidata.org/wiki/Q726648","display_name":"Fermi Gamma-ray Space Telescope","level":2,"score":0.46195507049560547},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.43111634254455566},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.42577171325683594},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.34198886156082153},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.2664651870727539},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.20405703783035278},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.16519016027450562},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.11571714282035828},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.10632213950157166},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.07768917083740234},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.07331234216690063},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C165205528","wikidata":"https://www.wikidata.org/wiki/Q83371","display_name":"Seismology","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0},{"id":"https://openalex.org/C120806208","wikidata":"https://www.wikidata.org/wiki/Q8072","display_name":"Volcano","level":2,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1177/1094342010385729","is_oa":false,"landing_page_url":"https://doi.org/10.1177/1094342010385729","pdf_url":null,"source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"},{"id":"https://openalex.org/F4320337373","display_name":"Center for Information Technology","ror":"https://ror.org/03jh5a977"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W1863336885","https://openalex.org/W1964031104","https://openalex.org/W2063186542","https://openalex.org/W2139774022","https://openalex.org/W2146406121","https://openalex.org/W2146959317","https://openalex.org/W3141650078","https://openalex.org/W4229666556","https://openalex.org/W6681665333"],"related_works":["https://openalex.org/W2769189194","https://openalex.org/W2020279179","https://openalex.org/W1537323515","https://openalex.org/W2353852602","https://openalex.org/W2120249721","https://openalex.org/W1493347376","https://openalex.org/W4293390906","https://openalex.org/W1998492046","https://openalex.org/W3147497457","https://openalex.org/W3121314575"],"abstract_inverted_index":{"We":[0,31,98],"present":[1],"an":[2],"improved":[3,65,101],"matrix\u2014matrix":[4],"multiplication":[5],"routine":[6],"(General":[7],"Matrix":[8],"Multiply":[9],"[GEMM])":[10],"in":[11,41,73,81,108],"the":[12,18,36,50,95,100,104,114,117,129],"MAGMA":[13,38],"BLAS":[14],"library":[15],"that":[16],"targets":[17],"NVIDIA":[19],"Fermi":[20],"graphics":[21],"processing":[22],"units":[23],"(GPUs)":[24],"using":[25],"Compute":[26],"Unified":[27],"Data":[28],"Architecture":[29],"(CUDA).":[30],"show":[32,113],"how":[33],"to":[34,43,70,78],"modify":[35],"previous":[37],"GEMM":[39],"kernels":[40,66,102,119],"order":[42],"make":[44],"a":[45,86],"more":[46],"efficient":[47],"use":[48],"of":[49,94,116],"Fermi\u2019s":[51],"new":[52,118],"architectural":[53],"features,":[54],"most":[55],"notably":[56],"their":[57,135],"extended":[58],"memory":[59,62],"hierarchy":[60],"and":[61,76,92,133],"sizes.":[63],"The":[64],"run":[67],"at":[68],"up":[69,77],"300":[71],"GFlop/s":[72,80],"double":[74],"precision":[75,83],"645":[79],"single":[82],"arithmetic":[84],"(on":[85],"C2050),":[87],"which":[88],"is":[89],"correspondingly":[90],"58%":[91],"63%":[93],"theoretical":[96],"peak.":[97],"compare":[99,134],"with":[103,137],"currently":[105,139],"available":[106,140],"version":[107],"CUBLAS":[109],"3.1.":[110],"Further,":[111],"we":[112],"effect":[115],"on":[120,143],"higher-level":[121],"dense":[122],"linear":[123],"algebra":[124],"(DLA)":[125],"routines":[126,141],"such":[127],"as":[128],"one-sided":[130],"matrix":[131],"factorizations,":[132],"performances":[136],"corresponding,":[138],"running":[142],"homogeneous":[144],"multicore":[145],"systems.":[146]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":8},{"year":2023,"cited_by_count":7},{"year":2022,"cited_by_count":13},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":12},{"year":2019,"cited_by_count":11},{"year":2018,"cited_by_count":15},{"year":2017,"cited_by_count":16},{"year":2016,"cited_by_count":19},{"year":2015,"cited_by_count":15},{"year":2014,"cited_by_count":19},{"year":2013,"cited_by_count":15},{"year":2012,"cited_by_count":23}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
