{"id":"https://openalex.org/W2122570236","doi":"https://doi.org/10.1145/2304576.2304625","title":"Enabling and scaling matrix computations on heterogeneous multi-core and multi-GPU systems","display_name":"Enabling and scaling matrix computations on heterogeneous multi-core and multi-GPU systems","publication_year":2012,"publication_date":"2012-06-25","ids":{"openalex":"https://openalex.org/W2122570236","doi":"https://doi.org/10.1145/2304576.2304625","mag":"2122570236"},"language":"en","primary_location":{"id":"doi:10.1145/2304576.2304625","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2304576.2304625","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 26th ACM international conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5032742296","display_name":"Fengguang Song","orcid":"https://orcid.org/0000-0001-7382-093X"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Fengguang Song","raw_affiliation_strings":["University of Tennessee, Knoxville, TN, USA"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN, USA","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083604741","display_name":"Stanimire Tomov","orcid":"https://orcid.org/0000-0002-5937-7959"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Stanimire Tomov","raw_affiliation_strings":["University of Tennessee, Knoxville, TN, USA"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN, USA","institution_ids":["https://openalex.org/I75027704"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5075517045","display_name":"Jack Dongarra","orcid":"https://orcid.org/0000-0003-3247-1782"},"institutions":[{"id":"https://openalex.org/I75027704","display_name":"University of Tennessee at Knoxville","ror":"https://ror.org/020f3ap87","country_code":"US","type":"education","lineage":["https://openalex.org/I75027704"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jack Dongarra","raw_affiliation_strings":["University of Tennessee, Knoxville, TN, USA"],"affiliations":[{"raw_affiliation_string":"University of Tennessee, Knoxville, TN, USA","institution_ids":["https://openalex.org/I75027704"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5032742296"],"corresponding_institution_ids":["https://openalex.org/I75027704"],"apc_list":null,"apc_paid":null,"fwci":14.0529,"has_fulltext":false,"cited_by_count":85,"citation_normalized_percentile":{"value":0.99285456,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"365","last_page":"376"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8659422397613525},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.8129427433013916},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6520929932594299},{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.6025890111923218},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.584736704826355},{"id":"https://openalex.org/keywords/cholesky-decomposition","display_name":"Cholesky decomposition","score":0.5360228419303894},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4861881136894226},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.47615906596183777},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.44626492261886597},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.4383874237537384},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.43061354756355286},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.41854581236839294},{"id":"https://openalex.org/keywords/heterogeneous-network","display_name":"Heterogeneous network","score":0.41047054529190063},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.37669748067855835},{"id":"https://openalex.org/keywords/wireless","display_name":"Wireless","score":0.13069462776184082},{"id":"https://openalex.org/keywords/grid","display_name":"Grid","score":0.12694582343101501},{"id":"https://openalex.org/keywords/wireless-network","display_name":"Wireless network","score":0.11320069432258606}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8659422397613525},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.8129427433013916},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6520929932594299},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.6025890111923218},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.584736704826355},{"id":"https://openalex.org/C34727166","wikidata":"https://www.wikidata.org/wiki/Q515375","display_name":"Cholesky decomposition","level":3,"score":0.5360228419303894},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4861881136894226},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.47615906596183777},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.44626492261886597},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.4383874237537384},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.43061354756355286},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.41854581236839294},{"id":"https://openalex.org/C158207573","wikidata":"https://www.wikidata.org/wiki/Q5747224","display_name":"Heterogeneous network","level":4,"score":0.41047054529190063},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.37669748067855835},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.13069462776184082},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.12694582343101501},{"id":"https://openalex.org/C108037233","wikidata":"https://www.wikidata.org/wiki/Q11375","display_name":"Wireless network","level":3,"score":0.11320069432258606},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C158693339","wikidata":"https://www.wikidata.org/wiki/Q190524","display_name":"Eigenvalues and eigenvectors","level":2,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/2304576.2304625","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2304576.2304625","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 26th ACM international conference on Supercomputing","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.259.5355","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.259.5355","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.netlib.org/utk/people/JackDongarra/PAPERS/ics2012-enable-gpu.pdf","raw_type":"text"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.715.3212","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.715.3212","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://icl.cs.utk.edu/news_pub/submissions/ics59-song.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Affordable and clean energy","score":0.4399999976158142,"id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W94439627","https://openalex.org/W988231918","https://openalex.org/W1483880184","https://openalex.org/W1842923156","https://openalex.org/W1863162408","https://openalex.org/W1966592804","https://openalex.org/W1969617278","https://openalex.org/W1990893960","https://openalex.org/W2000335122","https://openalex.org/W2010747199","https://openalex.org/W2018338919","https://openalex.org/W2025715322","https://openalex.org/W2047614366","https://openalex.org/W2050612421","https://openalex.org/W2065102625","https://openalex.org/W2065392434","https://openalex.org/W2068822791","https://openalex.org/W2121082877","https://openalex.org/W2121893797","https://openalex.org/W2122619936","https://openalex.org/W2123031400","https://openalex.org/W2124480634","https://openalex.org/W2139116943","https://openalex.org/W2142421493","https://openalex.org/W2145960411","https://openalex.org/W2150476673","https://openalex.org/W2153439050","https://openalex.org/W2154750565","https://openalex.org/W2157237396","https://openalex.org/W2170611190","https://openalex.org/W2567731932","https://openalex.org/W3103849684","https://openalex.org/W4231150350","https://openalex.org/W4241224545"],"related_works":["https://openalex.org/W2403987929","https://openalex.org/W2039814159","https://openalex.org/W2374847384","https://openalex.org/W2063512590","https://openalex.org/W2107649022","https://openalex.org/W2356488190","https://openalex.org/W2966537581","https://openalex.org/W2079508979","https://openalex.org/W1970284238","https://openalex.org/W4289143641"],"abstract_inverted_index":{"We":[0,60,91],"present":[1],"a":[2,32,36,41,95,116,131],"new":[3,96],"approach":[4,109],"to":[5,19,48,51,57,67,77,83,102,150],"utilizing":[6],"all":[7,11],"CPU":[8],"cores":[9],"and":[10,16,39,54,72,88,99,105,125,140,164],"GPUs":[12,56],"on":[13,130,148,154],"heterogeneous":[14,33,42,62],"multicore":[15],"multi-GPU":[17],"systems":[18],"support":[20],"dense":[21],"matrix":[22],"computations":[23],"efficiently.":[24],"The":[25],"main":[26],"idea":[27],"is":[28,110],"that":[29],"we":[30],"treat":[31],"system":[34,98],"as":[35,145,147],"distributed-memory":[37],"machine,":[38],"use":[40],"multi-level":[43],"block":[44],"cyclic":[45],"distribution":[46],"method":[47,76],"allocate":[49],"data":[50],"the":[52,69,79,103,155],"host":[53],"multiple":[55],"minimize":[58],"communication.":[59],"design":[61],"algorithms":[63],"with":[64],"hybrid":[65,80],"tiles":[66],"accommodate":[68],"processor":[70],"heterogeneity,":[71],"introduce":[73],"an":[74],"auto-tuning":[75],"determine":[78],"tile":[81],"sizes":[82],"attain":[84],"both":[85],"high":[86,117],"performance":[87],"load":[89,126,162],"balancing.":[90,127],"have":[92],"also":[93],"implemented":[94],"runtime":[97],"applied":[100],"it":[101],"Cholesky":[104],"QR":[106],"factorizations.":[107],"Our":[108,128],"designed":[111],"for":[112],"achieving":[113],"four":[114],"objectives:":[115],"degree":[118],"of":[119,166],"parallelism,":[120],"minimized":[121,123],"synchronization,":[122],"communication,":[124],"experiments":[129],"compute":[132,152],"node":[133],"(with":[134],"two":[135],"Intel":[136],"Westmere":[137],"hexa-core":[138],"CPUs":[139],"three":[141],"Nvidia":[142],"Fermi":[143],"GPUs),":[144],"well":[146],"up":[149],"100":[151],"nodes":[153],"Keeneland":[156],"system,":[157],"demonstrate":[158],"great":[159],"scalability,":[160],"good":[161],"balancing,":[163],"efficiency":[165],"our":[167],"approach.":[168]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":5},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":2},{"year":2018,"cited_by_count":6},{"year":2017,"cited_by_count":4},{"year":2016,"cited_by_count":12},{"year":2015,"cited_by_count":15},{"year":2014,"cited_by_count":20},{"year":2013,"cited_by_count":10},{"year":2012,"cited_by_count":3}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
