{"id":"https://openalex.org/W2161042056","doi":"https://doi.org/10.1109/clustr.2009.5289124","title":"Performance analysis of memory transfers and GEMM subroutines on NVIDIA Tesla GPU cluster","display_name":"Performance analysis of memory transfers and GEMM subroutines on NVIDIA Tesla GPU cluster","publication_year":2009,"publication_date":"2009-01-01","ids":{"openalex":"https://openalex.org/W2161042056","doi":"https://doi.org/10.1109/clustr.2009.5289124","mag":"2161042056"},"language":"en","primary_location":{"id":"doi:10.1109/clustr.2009.5289124","is_oa":false,"landing_page_url":"https://doi.org/10.1109/clustr.2009.5289124","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2009 IEEE International Conference on Cluster Computing and Workshops","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069250860","display_name":"Veerendra Allada","orcid":null},"institutions":[{"id":"https://openalex.org/I173911158","display_name":"Iowa State University","ror":"https://ror.org/04rswrd78","country_code":"US","type":"education","lineage":["https://openalex.org/I173911158"]},{"id":"https://openalex.org/I2802789608","display_name":"Ames National Laboratory","ror":"https://ror.org/041m9xr71","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I173911158","https://openalex.org/I2802789608","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Veerendra Allada","raw_affiliation_strings":["Electrical and Computer Engineering, Ames Laboratory, Iowa State University, Ames, IA, USA"],"affiliations":[{"raw_affiliation_string":"Electrical and Computer Engineering, Ames Laboratory, Iowa State University, Ames, IA, USA","institution_ids":["https://openalex.org/I173911158","https://openalex.org/I2802789608"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086036127","display_name":"Troy Benjegerdes","orcid":null},"institutions":[{"id":"https://openalex.org/I2802789608","display_name":"Ames National Laboratory","ror":"https://ror.org/041m9xr71","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I173911158","https://openalex.org/I2802789608","https://openalex.org/I39565521"]},{"id":"https://openalex.org/I173911158","display_name":"Iowa State University","ror":"https://ror.org/04rswrd78","country_code":"US","type":"education","lineage":["https://openalex.org/I173911158"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Troy Benjegerdes","raw_affiliation_strings":["Electrical and Computer Engineering, Ames Laboratory, Iowa State University, Ames, IA, USA"],"affiliations":[{"raw_affiliation_string":"Electrical and Computer Engineering, Ames Laboratory, Iowa State University, Ames, IA, USA","institution_ids":["https://openalex.org/I173911158","https://openalex.org/I2802789608"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5062604151","display_name":"Brett Bode","orcid":"https://orcid.org/0000-0002-4202-1024"},"institutions":[{"id":"https://openalex.org/I4210135837","display_name":"National Center for Supercomputing Applications","ror":"https://ror.org/03r10zj06","country_code":"US","type":"facility","lineage":["https://openalex.org/I157725225","https://openalex.org/I4210135837"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Brett Bode","raw_affiliation_strings":["National Center for Supercomputing Applications, Urbana-Champaign, IL, USA"],"affiliations":[{"raw_affiliation_string":"National Center for Supercomputing Applications, Urbana-Champaign, IL, USA","institution_ids":["https://openalex.org/I4210135837"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5069250860"],"corresponding_institution_ids":["https://openalex.org/I173911158","https://openalex.org/I2802789608"],"apc_list":null,"apc_paid":null,"fwci":1.8467,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.86808091,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9940000176429749,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8488351702690125},{"id":"https://openalex.org/keywords/subroutine","display_name":"Subroutine","score":0.8212897777557373},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.814292311668396},{"id":"https://openalex.org/keywords/xeon-phi","display_name":"Xeon Phi","score":0.5828959345817566},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5021970272064209},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.4903135597705841},{"id":"https://openalex.org/keywords/xeon","display_name":"Xeon","score":0.48427075147628784},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.4559992849826813},{"id":"https://openalex.org/keywords/graphics-processing-unit","display_name":"Graphics processing unit","score":0.44640278816223145},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.4397865831851959},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.23707139492034912},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09387889504432678}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8488351702690125},{"id":"https://openalex.org/C96147967","wikidata":"https://www.wikidata.org/wiki/Q190686","display_name":"Subroutine","level":2,"score":0.8212897777557373},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.814292311668396},{"id":"https://openalex.org/C96972482","wikidata":"https://www.wikidata.org/wiki/Q1049168","display_name":"Xeon Phi","level":2,"score":0.5828959345817566},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5021970272064209},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.4903135597705841},{"id":"https://openalex.org/C145108525","wikidata":"https://www.wikidata.org/wiki/Q656154","display_name":"Xeon","level":2,"score":0.48427075147628784},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.4559992849826813},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.44640278816223145},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.4397865831851959},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.23707139492034912},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09387889504432678},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/clustr.2009.5289124","is_oa":false,"landing_page_url":"https://doi.org/10.1109/clustr.2009.5289124","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2009 IEEE International Conference on Cluster Computing and Workshops","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.47999998927116394,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1555915743","https://openalex.org/W1988425770","https://openalex.org/W1992851788","https://openalex.org/W2028499920","https://openalex.org/W2032309817","https://openalex.org/W2059807497","https://openalex.org/W2063186542","https://openalex.org/W2090267299","https://openalex.org/W2105528986","https://openalex.org/W2108157916","https://openalex.org/W2136150370","https://openalex.org/W2141579716","https://openalex.org/W2146930501","https://openalex.org/W2155503253","https://openalex.org/W2169150396","https://openalex.org/W2174937805","https://openalex.org/W2295862081","https://openalex.org/W2725179571","https://openalex.org/W3138798301","https://openalex.org/W3141650078","https://openalex.org/W4241513866","https://openalex.org/W4250981202","https://openalex.org/W6680965528"],"related_works":["https://openalex.org/W2475524688","https://openalex.org/W2739740241","https://openalex.org/W2085105049","https://openalex.org/W2592417500","https://openalex.org/W1974923383","https://openalex.org/W2526069705","https://openalex.org/W2024016913","https://openalex.org/W2019153376","https://openalex.org/W2981664121","https://openalex.org/W2796552083"],"abstract_inverted_index":{"Commodity":[0],"clusters":[1],"augmented":[2],"with":[3,19,182,209],"application":[4,37],"accelerators":[5],"are":[6,117,175],"evolving":[7],"as":[8,98],"competitive":[9],"high":[10,22],"performance":[11,26,108,160],"computing":[12],"systems.":[13],"The":[14,159,177,200],"Graphical":[15],"Processing":[16],"Unit":[17],"(GPU)":[18],"a":[20,31,132,204],"very":[21],"arithmetic":[23],"density":[24],"and":[25,58,113,146,155,164],"per":[27],"price":[28],"ratio":[29],"is":[30,96,140,203],"good":[32],"platform":[33],"for":[34],"the":[35,42,46,50,56,59,69,73,83,91,99,107,110,121,126,136,144,149,153,156,162,170,185,189,197],"scientific":[36],"acceleration.":[38],"In":[39,102],"addition":[40],"to":[41,63,67,119,125,142,195],"interconnect":[43],"bottlenecks":[44],"among":[45,89],"cluster":[47,207],"compute":[48],"nodes,":[49],"cost":[51],"of":[52,72,82,109,148,161,184],"memory":[53,111,150],"copies":[54,112,151],"between":[55,152],"host":[57,154],"GPU":[60,127,157],"device":[61],"have":[62,179],"be":[64],"carefully":[65],"amortized":[66],"improve":[68],"overall":[70],"efficiency":[71],"application.":[74],"Scientific":[75],"applications":[76],"also":[77],"rely":[78],"on":[79,135],"efficient":[80],"implementation":[81],"Basic":[84],"Linear":[85],"Algebra":[86],"Subroutines":[87],"(BLAS),":[88],"which":[90],"General":[92],"Matrix":[93],"Multiply":[94],"(GEMM)":[95],"considered":[97],"workhorse":[100],"subroutine.":[101],"this":[103],"paper,":[104],"we":[105],"study":[106],"GEMM":[114,167],"subroutines":[115,168],"that":[116,130,183],"crucial":[118],"port":[120],"computational":[122,198],"chemistry":[123],"algorithms":[124],"clusters.":[128],"To":[129],"end,":[131],"benchmark":[133],"based":[134],"NetPIPE":[137],"[1]":[138],"framework":[139],"developed":[141],"evaluate":[143],"latency":[145],"bandwidth":[147],"device.":[158],"single":[163],"double":[165],"precision":[166],"from":[169,188],"NVIDIA":[171,210],"CUBLAS":[172],"2.0":[173],"library":[174],"studied.":[176],"results":[178],"been":[180],"compared":[181],"BLAS":[186],"routines":[187],"Intel":[190,205],"Math":[191],"Kernel":[192],"Library":[193],"(MKL)":[194],"understand":[196],"trade-offs.":[199],"test":[201],"bed":[202],"Xeon":[206],"equipped":[208],"Tesla":[211],"GPUs.":[212]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2020,"cited_by_count":1},{"year":2018,"cited_by_count":1},{"year":2016,"cited_by_count":1},{"year":2015,"cited_by_count":1},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
