{"id":"https://openalex.org/W2416378570","doi":"https://doi.org/10.1142/s0129626416500079","title":"Achieving Native GPU Performance for Out-of-Card Large Dense Matrix Multiplication","display_name":"Achieving Native GPU Performance for Out-of-Card Large Dense Matrix Multiplication","publication_year":2016,"publication_date":"2016-06-01","ids":{"openalex":"https://openalex.org/W2416378570","doi":"https://doi.org/10.1142/s0129626416500079","mag":"2416378570"},"language":"en","primary_location":{"id":"doi:10.1142/s0129626416500079","is_oa":false,"landing_page_url":"https://doi.org/10.1142/s0129626416500079","pdf_url":null,"source":{"id":"https://openalex.org/S18360026","display_name":"Parallel Processing Letters","issn_l":"0129-6264","issn":["0129-6264","1793-642X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319815","host_organization_name":"World Scientific","host_organization_lineage":["https://openalex.org/P4310319815"],"host_organization_lineage_names":["World Scientific"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Parallel Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5067238717","display_name":"Jing Wu","orcid":"https://orcid.org/0009-0004-7049-5480"},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jing Wu","raw_affiliation_strings":["Department of Electrical and Computer Engineering and Institute for Advanced Computer Studies, University of Maryland, College Park, Maryland 20742, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering and Institute for Advanced Computer Studies, University of Maryland, College Park, Maryland 20742, USA","institution_ids":["https://openalex.org/I66946132"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085648933","display_name":"Joseph F. J\u00e1J\u00e1","orcid":"https://orcid.org/0000-0002-8620-5650"},"institutions":[{"id":"https://openalex.org/I66946132","display_name":"University of Maryland, College Park","ror":"https://ror.org/047s2c258","country_code":"US","type":"education","lineage":["https://openalex.org/I66946132"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Joseph Jaja","raw_affiliation_strings":["Department of Electrical and Computer Engineering and Institute for Advanced Computer Studies, University of Maryland, College Park, Maryland 20742, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering and Institute for Advanced Computer Studies, University of Maryland, College Park, Maryland 20742, USA","institution_ids":["https://openalex.org/I66946132"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5067238717"],"corresponding_institution_ids":["https://openalex.org/I66946132"],"apc_list":null,"apc_paid":null,"fwci":1.281,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.78512246,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":"26","issue":"02","first_page":"1650007","last_page":"1650007"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8208946585655212},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.8042033910751343},{"id":"https://openalex.org/keywords/pci-express","display_name":"PCI Express","score":0.7548039555549622},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.7005773782730103},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.590701699256897},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.5609651207923889},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.553987443447113},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5398115515708923},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5169955492019653},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.47027164697647095},{"id":"https://openalex.org/keywords/node","display_name":"Node (physics)","score":0.452056348323822},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.44846591353416443},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.39735090732574463},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.303758442401886},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.2651822865009308},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.252483069896698},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.18087711930274963},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.1782398819923401}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8208946585655212},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.8042033910751343},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.7548039555549622},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.7005773782730103},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.590701699256897},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.5609651207923889},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.553987443447113},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5398115515708923},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5169955492019653},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.47027164697647095},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.452056348323822},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.44846591353416443},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.39735090732574463},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.303758442401886},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.2651822865009308},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.252483069896698},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.18087711930274963},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.1782398819923401},{"id":"https://openalex.org/C66938386","wikidata":"https://www.wikidata.org/wiki/Q633538","display_name":"Structural engineering","level":1,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1142/s0129626416500079","is_oa":false,"landing_page_url":"https://doi.org/10.1142/s0129626416500079","pdf_url":null,"source":{"id":"https://openalex.org/S18360026","display_name":"Parallel Processing Letters","issn_l":"0129-6264","issn":["0129-6264","1793-642X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319815","host_organization_name":"World Scientific","host_organization_lineage":["https://openalex.org/P4310319815"],"host_organization_lineage_names":["World Scientific"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Parallel Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.49000000953674316,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":1,"referenced_works":["https://openalex.org/W2073061372"],"related_works":["https://openalex.org/W1963859303","https://openalex.org/W2364044215","https://openalex.org/W2389600408","https://openalex.org/W240129890","https://openalex.org/W3048701459","https://openalex.org/W2149078538","https://openalex.org/W2080146221","https://openalex.org/W2370314112","https://openalex.org/W1912958759","https://openalex.org/W2792081825"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,38,95],"illustrate":[4],"the":[5,30,33,52,55,63,75],"possibility":[6],"of":[7,32,77],"developing":[8],"strategies":[9],"to":[10,29,60,98],"carry":[11],"out":[12],"matrix":[13,42],"computations":[14],"on":[15,23,45,62,106],"heterogeneous":[16,47],"platforms":[17],"which":[18,66],"achieve":[19,99],"native":[20],"GPU":[21,70],"performance":[22,105],"very":[24],"large":[25,59],"data":[26,87],"sizes":[27],"up":[28],"capacity":[31],"CPU":[34],"memory.":[35],"More":[36],"specifically,":[37],"present":[39],"a":[40,46,93,107],"dense":[41],"multiplication":[43],"strategy":[44,73],"platform,":[48],"specifically":[49],"tailored":[50],"for":[51],"case":[53],"when":[54],"input":[56],"is":[57],"too":[58],"fit":[61],"device":[64],"memory,":[65],"achieves":[67],"near":[68],"peak":[69],"performance.":[71],"Our":[72],"involves":[74],"development":[76],"CUDA":[78],"stream":[79],"based":[80],"software":[81],"pipelines":[82],"that":[83],"effectively":[84],"overlap":[85],"PCIe":[86],"transfers":[88],"with":[89],"kernel":[90],"executions.":[91],"As":[92],"result,":[94],"are":[96],"able":[97],"over":[100],"1":[101,111],"and":[102,112],"2":[103,113],"TFLOPS":[104],"single":[108],"node":[109],"using":[110],"GPUs":[114],"respectively.":[115]},"counts_by_year":[{"year":2020,"cited_by_count":1},{"year":2018,"cited_by_count":2},{"year":2017,"cited_by_count":1},{"year":2016,"cited_by_count":1}],"updated_date":"2026-05-21T09:19:25.381259","created_date":"2025-10-10T00:00:00"}
